In [None]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Show floats in displayed frames with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: echo the value of every top-level expression
# in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
# and preview its first rows.
main_data = pd.read_csv(filepath_or_buffer='Kaggle_Data/Stars.csv')
main_data.head(n=5)

In [None]:
# Column dtypes of the raw frame; checked before the frame is passed to get_dummies
main_data.dtypes

In [None]:
# Dummy-encode the frame. drop_first=True omits the first level of every encoded
# column, so k levels become k-1 indicator columns.
coded_data = pd.get_dummies(data=main_data, drop_first=True)
coded_data.head(n=5)

In [None]:
# Number of rows, followed by the distinct labels found in the 'Type' column
len(coded_data.index)
set(coded_data['Type'].tolist())

In [None]:
# Slice the rows by class label once, so every class can be drawn in its own style.
X0, X1, X2, X3, X4, X5 = (coded_data[coded_data['Type'] == label] for label in range(6))

# (subset, colour, marker, legend label) for each class, in drawing order
class_styles = [
    (X0, 'red', 'o', 'Type 0'),
    (X1, 'blue', '<', 'Type 1'),
    (X2, 'orange', 'd', 'Type 2'),
    (X3, 'cyan', 'x', 'Type 3'),
    (X4, 'black', 'v', 'Type 4'),
    (X5, 'green', '+', 'Type 5'),
]

# One figure per feature column: feature value on x, class label on y.
for col in coded_data.columns.drop('Type'):
    for subset, colour, marker, label in class_styles:
        plt.scatter(subset[col], subset['Type'], color=colour, marker=marker, label=label)
    plt.xlabel(col)
    plt.ylabel('Type')
    # plt.legend(loc='center right')
    plt.show()

In [None]:
# Missing-value count for each column of the encoded frame
coded_data.isna().sum(axis=0)

In [None]:
# Hold out 20% of the rows for testing. Passing the label column as `stratify`
# keeps the class proportions the same in both partitions, and random_state
# makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    coded_data.drop(columns=['Type']),
    coded_data['Type'],
    test_size=0.2,
    stratify=coded_data['Type'],
    random_state=50,
)
X_train
X_test
y_train
y_test

In [None]:
# Unregularised multinomial (softmax) logistic regression.
# - penalty=None replaces the string 'none', which scikit-learn deprecated in 1.2
#   and rejects from 1.4 onwards.
# - multi_class is no longer passed: it is deprecated since 1.5, and with more than
#   two classes the newton-cg solver already fits the multinomial model by default.
# NOTE(review): the newest scikit-learn releases are moving away from `penalty`
# altogether (towards C / l1_ratio); revisit if a FutureWarning appears here.
model = LogisticRegression(fit_intercept=True, solver='newton-cg', penalty=None, max_iter=1000)

# Variations worth trying:
#   - solver='lbfgs', or a larger max_iter (e.g. 10000) if the solver reports
#     that it did not converge
#   - one-vs-rest instead of multinomial (sklearn.multiclass.OneVsRestClassifier)
#   - regularisation: penalty='l2' together with C=10.0 (try other values of C too)

model.fit(X_train, y_train)

# Mean accuracy on the data the model was trained on
model.score(X_train, y_train)

# Slope coefficients: one row per class, one column per feature
model.coef_

# Intercepts (Beta_0): one value per class
model.intercept_

In [None]:
# Predicted class for every held-out row. X_test's index is carried over so the
# predictions can be joined back to labels and features by index.
test_output = pd.Series(model.predict(X_test), index=X_test.index, name='pred_Type').to_frame()
test_output.head(n=5)

In [None]:
# Put the true label next to the prediction. Selecting only 'pred_Type' first keeps
# this cell re-runnable: merging onto a frame that already holds 'Type' would
# produce 'Type_x' / 'Type_y' columns instead.
test_output = test_output[['pred_Type']].merge(y_test, left_index=True, right_index=True)
test_output.head()

# score() returns the fraction of correct predictions (0..1), so scale it by 100
# before reporting it as a percentage.
test_accuracy = model.score(X_test, y_test)
print(f'Percentage of correct predictions is {100 * test_accuracy:.2f}%')

In [None]:
# Append the feature columns to the prediction/label pair. Restricting the left side
# to those two columns keeps the cell re-runnable: a second plain merge would
# duplicate every feature with '_x' / '_y' suffixes.
test_output = test_output[['pred_Type', 'Type']].merge(X_test, left_index=True, right_index=True)
test_output.head()

In [None]:
# Per-class probabilities for every row (train and test alike), as a raw array.
# The label column is excluded so only the features reach the model.
model.predict_proba(coded_data.loc[:, coded_data.columns != 'Type'])

In [None]:
# Same probabilities as a labelled frame. The column names come from model.classes_,
# which lists the classes in the order predict_proba returns them, rather than
# from a hard-coded list; the index is carried over so rows stay traceable.
pd.DataFrame(
    model.predict_proba(coded_data.drop(columns=['Type'])),
    index=coded_data.index,
    columns=model.classes_,
)

In [None]:
# Build the probability frame once and reuse it, instead of running predict_proba
# twice. Columns are labelled with model.classes_ so idxmax returns real class labels.
class_probabilities = pd.DataFrame(
    model.predict_proba(coded_data.drop(columns=['Type'])),
    index=coded_data.index,
    columns=model.classes_,
)

# Highest class probability per row, i.e. the model's confidence in its prediction
class_probabilities.max(axis=1)

# Class that attains that highest probability, i.e. the predicted class
class_probabilities.idxmax(axis=1)

In [None]:
# Training features plus the true label and the model's confidence in its prediction.
# With several classes predict_proba returns one probability per class, so the value
# kept here is the largest of them (the probability of the predicted class).
# For a binary problem one would take column 1 of predict_proba instead.
data_with_prob = X_train.assign(
    Type=y_train,
    Probability=model.predict_proba(X_train).max(axis=1),
)

data_with_prob.head(n=5)

In [None]:
# Probability of the predicted class for each held-out row. As with the training
# frame, this is the largest of the per-class probabilities.
# Predicting from X_test, rather than from test_output minus its label columns,
# keeps the cell re-runnable: once 'Probability' exists it would otherwise be fed
# back to the model as a feature. Wrapping the result in a Series indexed like
# X_test makes the assignment align on index rather than on row position.
test_output['Probability'] = pd.Series(
    model.predict_proba(X_test).max(axis=1),
    index=X_test.index,
)

test_output.head()