In [None]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Format floats in pandas output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: with "all", IPython displays the value of
# every top-level expression statement in a cell instead of only the last one.
# The cells below rely on this to show several results per cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Helpers for explicit rich output from inside a cell
from IPython.display import display, HTML

#### Read, pre-process and visualize data

In [None]:
# Load the letter-recognition data.
# `symmetric` selects which CSV is read and also gates every
# "if symmetric:" cell further down the notebook.
symmetric = True
data_set = pd.read_csv(
    '../assignments_sp23/letter-recognition.csv'
    if symmetric
    else '../assignments_sp23/letter-recognition-original.csv'
)
data_set.head()

In [None]:
# Number of rows in the loaded data set
data_set.shape[0]

In [None]:
# Count of missing values in each column
data_set.isna().sum(axis=0)

#### Symmetric Letter Prediction ####

In [None]:
if symmetric:
    # Hold out 20% of the rows for testing. Both label columns are removed
    # from the features, and stratifying on the target ('Symmetric') keeps
    # its class proportions the same in the train and test partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        data_set.drop(columns=['Symmetric', 'Capital_letter']),
        data_set['Symmetric'],
        test_size=0.2,
        stratify=data_set['Symmetric'],
        random_state=50,
    )
    X_train
    X_test
    y_train
    y_test

In [None]:
if symmetric:
    # Alternative configurations (kept for reference):
    # model = LogisticRegression(fit_intercept = True, solver='lbfgs', multi_class = 'auto', penalty = 'none')
    # model = LogisticRegression(fit_intercept = True, solver='liblinear', multi_class = 'auto', penalty = 'l1', C = 0.1)
    model = LogisticRegression(
        penalty='l1',
        C=10,
        solver='liblinear',
        fit_intercept=True,
        multi_class='auto',
    )

    model.fit(X_train, y_train)

    # Mean accuracy on the training data and labels
    model.score(X_train, y_train)

    # Fitted coefficients of the input features
    model.coef_

    # Fitted intercept term (Beta_0)
    model.intercept_

In [None]:
if symmetric:
    # score() returns the mean accuracy as a fraction, so scale it by 100 to
    # make the printed number agree with the "Percentage" label.
    print('Percentage of correct predictions is ')
    print(100 * model.score(X_test, y_test))

In [None]:
if symmetric:
    # Gradient-boosted trees on the current train/test split.
    X_train.columns
    gb = GradientBoostingClassifier(random_state=50, min_samples_split=12, min_samples_leaf=6, max_depth=4, n_estimators=100)

    gb = gb.fit(X_train, y_train)
    # Mean accuracy on the training data
    gb.score(X_train, y_train)

    # Feature importances, most important first
    feat_imp = pd.Series(gb.feature_importances_, X_train.columns.values).sort_values(ascending=False)

    # Same importances as a two-column table
    feat_imp_table = pd.DataFrame(feat_imp)
    feat_imp_table = feat_imp_table.reset_index()
    feat_imp_table.columns = ['Features', 'Values']

    # Create the figure BEFORE plotting and draw on its axes. Calling
    # plt.figure(figsize=...) after the plot left the bar chart at the
    # default size and opened a second, empty figure.
    fig, ax = plt.subplots(figsize=[40, 20], dpi=50)
    feat_imp.plot(kind='bar', title='Feature Importances', ax=ax)
    ax.set_ylabel('Feature Importance Score')
    feat_imp.head(12)

    # Predictions on the test set, joined with the true labels on the row index
    test_output = pd.DataFrame(gb.predict(X_test), index=X_test.index, columns=['pred_Y'])

    test_output.head()
    test_output = test_output.merge(y_test, left_index=True, right_index=True)
    test_output.head()
    print('Fraction of correct classification ')
    gb.score(X_test, y_test)

In [None]:
if symmetric:
    # Random forest on the current train/test split.
    X_train.columns
    rf = RandomForestClassifier(random_state=50, min_samples_leaf=6, max_features="sqrt", n_estimators=100)

    rf = rf.fit(X_train, y_train)
    # Mean accuracy on the training data
    rf.score(X_train, y_train)

    # Feature importances, most important first
    feat_imp = pd.Series(rf.feature_importances_, X_train.columns.values).sort_values(ascending=False)

    # Same importances as a two-column table
    feat_imp_table = pd.DataFrame(feat_imp)
    feat_imp_table = feat_imp_table.reset_index()
    feat_imp_table.columns = ['Features', 'Values']

    # Create the figure BEFORE plotting and draw on its axes. Calling
    # plt.figure(figsize=...) after the plot left the bar chart at the
    # default size and opened a second, empty figure.
    fig, ax = plt.subplots(figsize=[40, 20], dpi=50)
    feat_imp.plot(kind='bar', title='Feature Importances', ax=ax)
    ax.set_ylabel('Feature Importance Score')
    feat_imp.head(12)

    # Predictions on the test set, joined with the true labels on the row index
    test_output = pd.DataFrame(rf.predict(X_test), index=X_test.index, columns=['pred_Y'])

    test_output.head()
    test_output = test_output.merge(y_test, left_index=True, right_index=True)
    test_output.head()
    print('Fraction of correct classification ')
    rf.score(X_test, y_test)

In [None]:
if symmetric:
    # Wrap the train and test partitions, together with their labels, in
    # xgboost DMatrix objects for the classification run below.
    dtrain_class = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dtest_class = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

In [None]:
if symmetric:
    # Booster hyperparameters.
    # If a GPU is available, the following was noted as an option:
    # params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
    params = {
        "objective": "binary:hinge",
        "tree_method": "exact",  # use "tree_method" : "hist" if you need speed
        "max_depth": 4,
        "learning_rate": 0.4,
    }

In [None]:
if symmetric:
    # Number of boosting rounds
    n = 100

    # NOTE: this rebinds `model`, which an earlier cell assigned to the
    # logistic regression estimator.
    model = xgb.train(params=params, dtrain=dtrain_class, num_boost_round=n)

In [None]:
if symmetric: 
    # NOTE(review): mean_squared_error is not used in any cell visible here;
    # confirm whether a later cell needs it before moving or removing it.
    from sklearn.metrics import mean_squared_error
    # Predict on the held-out DMatrix; `model` is expected to be the booster
    # returned by xgb.train in the preceding cell.
    preds = model.predict(dtest_class)

In [None]:
if symmetric:
    # Pair each prediction with its true label, joined on the test-set row index.
    test_output = pd.DataFrame(preds, index=X_test.index, columns=['pred_Y'])
    test_output.head()
    test_output = test_output.merge(y_test, left_index=True, right_index=True)
    test_output.head()

    actual = test_output['Symmetric']
    predicted = test_output['pred_Y']
    matches = actual == predicted

    # Class counts among the true labels ...
    sum(actual == 0)
    sum(actual == 1)
    # ... and among the predictions
    sum(predicted == 0)
    sum(predicted == 1)
    # Number of correct predictions, number of test rows, and their ratio
    sum(matches)
    len(test_output)
    sum(matches) / len(test_output)

#### Predicting Capital Letter ####

In [None]:
# Build the train/test split for predicting 'Capital_letter'.
# In symmetric mode the extra 'Symmetric' label column is dropped from the
# features together with the target; otherwise only the target is dropped.
label_columns = ['Symmetric', 'Capital_letter'] if symmetric else ['Capital_letter']

# Stratify on the target being predicted so each letter keeps its share in
# both partitions. The symmetric branch previously stratified on 'Symmetric',
# which does not guarantee that for the letter classes and disagreed with the
# non-symmetric branch.
X_train, X_test, y_train, y_test = train_test_split(
    data_set.drop(columns=label_columns),
    data_set['Capital_letter'],
    test_size=0.2,
    stratify=data_set['Capital_letter'],
    random_state=50,
)
X_train
X_test
y_train
y_test


In [None]:
# Alternative configurations (kept for reference):
# model = LogisticRegression(fit_intercept = True, solver='lbfgs', multi_class = 'auto', penalty = 'none')
# model = LogisticRegression(fit_intercept = True, solver='liblinear', multi_class = 'auto', penalty = 'l1', C = 10)
model = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    fit_intercept=True,
    multi_class='auto',
)

model.fit(X_train, y_train)

# Mean accuracy on the training data and labels
model.score(X_train, y_train)

# Fitted coefficients of the input features
model.coef_

# Fitted intercept term(s) (Beta_0)
model.intercept_

In [None]:
# score() returns the mean accuracy as a fraction, so scale it by 100 to make
# the printed number agree with the "Percentage" label.
print('Percentage of correct predictions is ')
print(100 * model.score(X_test, y_test))

In [None]:
# Gradient-boosted trees on the current train/test split.
X_train.columns
gb = GradientBoostingClassifier(random_state=50, min_samples_split=12, min_samples_leaf=6, max_depth=4, n_estimators=100)

gb = gb.fit(X_train, y_train)
# Mean accuracy on the training data
gb.score(X_train, y_train)

# Feature importances, most important first
feat_imp = pd.Series(gb.feature_importances_, X_train.columns.values).sort_values(ascending=False)

# Same importances as a two-column table
feat_imp_table = pd.DataFrame(feat_imp)
feat_imp_table = feat_imp_table.reset_index()
feat_imp_table.columns = ['Features', 'Values']

# Create the figure BEFORE plotting and draw on its axes. Calling
# plt.figure(figsize=...) after the plot left the bar chart at the default
# size and opened a second, empty figure.
fig, ax = plt.subplots(figsize=[40, 20], dpi=50)
feat_imp.plot(kind='bar', title='Feature Importances', ax=ax)
ax.set_ylabel('Feature Importance Score')
feat_imp.head(12)

# Predictions on the test set, joined with the true labels on the row index
test_output = pd.DataFrame(gb.predict(X_test), index=X_test.index, columns=['pred_Y'])

test_output.head()
test_output = test_output.merge(y_test, left_index=True, right_index=True)
test_output.head()
print('Fraction of correct classification ')
gb.score(X_test, y_test)

In [None]:
# Random forest on the current train/test split.
X_train.columns
rf = RandomForestClassifier(random_state=50, min_samples_leaf=6, max_features="sqrt", n_estimators=100)

rf = rf.fit(X_train, y_train)
# Mean accuracy on the training data
rf.score(X_train, y_train)

# Feature importances, most important first
feat_imp = pd.Series(rf.feature_importances_, X_train.columns.values).sort_values(ascending=False)

# Same importances as a two-column table
feat_imp_table = pd.DataFrame(feat_imp)
feat_imp_table = feat_imp_table.reset_index()
feat_imp_table.columns = ['Features', 'Values']

# Create the figure BEFORE plotting and draw on its axes. Calling
# plt.figure(figsize=...) after the plot left the bar chart at the default
# size and opened a second, empty figure.
fig, ax = plt.subplots(figsize=[40, 20], dpi=50)
feat_imp.plot(kind='bar', title='Feature Importances', ax=ax)
ax.set_ylabel('Feature Importance Score')
feat_imp.head(12)

# Predictions on the test set, joined with the true labels on the row index
test_output = pd.DataFrame(rf.predict(X_test), index=X_test.index, columns=['pred_Y'])

test_output.head()
test_output = test_output.merge(y_test, left_index=True, right_index=True)
test_output.head()
print('Fraction of correct classification ')
rf.score(X_test, y_test)
