In [0]:
# Importing libraries and modules
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import metrics

import matplotlib.pyplot as plt

# !pip install category_encoders

In [138]:
# Load the student sample from the raw GitHub URL, reading only the columns listed below
columns = [
    'code_module',
    'code_presentation',
    'gender',
    'region',
    'highest_education',
    'age_band',
    'num_of_prev_attempts',
    'studied_credits',
    'disability',
    'final_result',
]

# Colab alternative (disabled): upload a local studentInfo.csv with
# google.colab.files.upload() and read it through io.BytesIO, passing names=columns.
# An optional .replace(np.NaN, 'unknown') on the loaded frame is also disabled.
df = pd.read_csv(
    "https://raw.githubusercontent.com/prabinspkt/Machine-Learning-Project/master/random_sample.csv",
    usecols=columns,
)
df.head()

Unnamed: 0,code_module,code_presentation,gender,region,highest_education,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,FFF,2014J,M,Scotland,Post Graduate Qualification,0-35,0,120,N,Pass
1,DDD,2013J,F,London Region,A Level or Equivalent,0-35,1,60,N,Withdrawn
2,BBB,2014J,F,Wales,A Level or Equivalent,0-35,0,60,N,Withdrawn
3,BBB,2014J,F,Yorkshire Region,A Level or Equivalent,0-35,0,120,N,Distinction
4,BBB,2013B,F,North Region,Lower Than A Level,0-35,0,60,N,Fail


In [139]:
# Data check: count the missing values in each column
df.isnull().sum()

code_module             0
code_presentation       0
gender                  0
region                  0
highest_education       0
age_band                0
num_of_prev_attempts    0
studied_credits         0
disability              0
final_result            0
dtype: int64

In [140]:
# Feature engineering: encode the target label
# Each final_result category becomes one of four integer class codes, used for
# prediction later. A single lookup replaces the four chained .replace() calls,
# and the explicit int64 cast avoids depending on pandas implicitly downcasting
# the object column inside replace() (deprecated in recent pandas releases).
result_codes = {'Distinction': 10000, 'Pass': 5000, 'Withdrawn': 1000, 'Fail': 0}
df['final_result'] = (
    df['final_result']
    # Values that are not keys (e.g. codes from a previous run of this cell)
    # pass through unchanged, so re-running the cell is safe.
    .map(lambda label: result_codes.get(label, label))
    # Raises if any unexpected, still-textual label remains unencoded.
    .astype('int64')
)

print(df['final_result'].unique())

[ 5000  1000 10000     0]


In [141]:
# Preview the first five rows after final_result has been encoded
df.head(n=5)

Unnamed: 0,code_module,code_presentation,gender,region,highest_education,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,FFF,2014J,M,Scotland,Post Graduate Qualification,0-35,0,120,N,5000
1,DDD,2013J,F,London Region,A Level or Equivalent,0-35,1,60,N,1000
2,BBB,2014J,F,Wales,A Level or Equivalent,0-35,0,60,N,1000
3,BBB,2014J,F,Yorkshire Region,A Level or Equivalent,0-35,0,120,N,10000
4,BBB,2013B,F,North Region,Lower Than A Level,0-35,0,60,N,0


In [142]:
# Shuffle the rows before the positional train/test split done further down.
# A fixed random_state makes the row order, and therefore the split and the
# reported accuracy, reproducible from one run to the next.
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,code_module,code_presentation,gender,region,highest_education,age_band,num_of_prev_attempts,studied_credits,disability,final_result
240,BBB,2014B,F,South West Region,A Level or Equivalent,35-55,0,60,N,10000
40,DDD,2014J,F,Wales,A Level or Equivalent,35-55,0,120,N,1000
12,DDD,2014B,F,South East Region,A Level or Equivalent,0-35,1,120,N,5000
285,FFF,2013J,M,South East Region,Lower Than A Level,35-55,0,60,N,10000
261,CCC,2014J,M,East Anglian Region,A Level or Equivalent,0-35,0,60,N,1000


In [143]:
# One hot encoding (demonstration on a single column)
# The `sparse=False` argument was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `sparse_output`). Using the default encoder and densifying the
# result with .toarray() works on both old and new scikit-learn versions.
ohe = OneHotEncoder()
ohe.fit_transform(df[['code_module']]).toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [144]:
# Inspect the category values the encoder learned for code_module in the fit above
ohe.categories_

[array(['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG'], dtype=object)]

In [0]:
# One-hot encode the categorical columns; every other column passes through unchanged
categorical_features = [
    'code_module',
    'code_presentation',
    'gender',
    'region',
    'highest_education',
    'age_band',
    'disability',
]
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    remainder='passthrough',
)

In [0]:
# Separate the target column from the feature columns used to predict it
Y = df['final_result']
X = df.drop(columns=['final_result'])

In [0]:
# Fit the column transformer on X and build the encoded feature matrix.
# NOTE(review): the encoder is fitted on all rows, before the train/test slicing
# that follows — confirm this is intended.
data_matrix = column_trans.fit_transform(X)

In [0]:
# Train and test split
# The rows were shuffled earlier, so a positional cut is used: the first
# N_TRAIN rows are for training and the remainder for testing.
N_TRAIN = 300

# Guard against a dataset too small to leave any test rows; without this the
# slices below would silently produce an empty test partition.
assert 0 < N_TRAIN < data_matrix.shape[0], (
    "N_TRAIN must leave at least one row for the test partition"
)

X_train, X_test = data_matrix[:N_TRAIN], data_matrix[N_TRAIN:]

# Convert the labels once so both partitions are cut from the same array.
labels = Y.to_numpy()
Y_train, Y_test = labels[:N_TRAIN], labels[N_TRAIN:]

In [149]:
# Fit a logistic-regression classifier (L-BFGS solver) on the training partition
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X=X_train, y=Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [150]:
# Evaluate: predict the held-out rows and report the fraction classified correctly
Y_pred = logreg.predict(X_test)
test_accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(test_accuracy)

0.3619047619047619


In [0]:
# Draw observations and conclusions from model and data