In [6]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skl
sns.set()

In [7]:
df_test = pd.read_csv("test.csv")
df_train = pd.read_csv("train.csv")

In [8]:
merge_df = [df_test, df_train]

In [9]:
df = pd.concat(merge_df)

In [10]:
df = df.drop('id', axis=1)

In [11]:
df

Unnamed: 0,gender,group,preparation rating,CSE1203 Discrete Mathematics score,EEE2142 Electronic Devices and Circuits' score,CSE2200 Software Development I's score,MATH2203 Mathematics IV's score,Admission Fees,Education Level of Parents
0,male,A3,5,67,73,68,58,standard,ED
1,male,A3,5,76,80,73,74,standard,ED
2,female,group E,2,87,94,95,87,standard,associate's degree
3,female,A2,3,82,85,87,97,standard,CD
4,female,A3,4,73,76,78,68,standard,CD
...,...,...,...,...,...,...,...,...,...
795,female,group E,5,57,68,73,77,free,associate's degree
796,male,A4,2,70,70,70,56,standard,ED
797,female,group E,0,70,84,81,91,free,associate's degree
798,male,group E,0,69,60,54,51,standard,CD


In [12]:
df = df.drop('gender', axis=1)

In [13]:
df = df.drop('group', axis=1)

In [14]:
df = df.drop('Admission Fees', axis=1)

In [15]:
df = df.drop('Education Level of Parents', axis=1)

In [16]:
df.columns=df.columns.str.replace("'",'')

In [17]:
df.rename(columns={'preparation rating': 'target', 'CSE1203 Discrete Mathematics score': 'CSE_dis_math_score', 
                   'EEE2142 Electronic Devices and Circuits score': 'EEE_elctrnc_circuits_score', 
                   'CSE2200 Software Development Is score': 'CSE_softwr_devlplmnt_score', 
                   'MATH2203 Mathematics IVs score': 'Mathematics_ivs_score'}, inplace=True)

In [18]:
df

Unnamed: 0,target,CSE_dis_math_score,EEE_elctrnc_circuits_score,CSE_softwr_devlplmnt_score,Mathematics_ivs_score
0,5,67,73,68,58
1,5,76,80,73,74
2,2,87,94,95,87
3,3,82,85,87,97
4,4,73,76,78,68
...,...,...,...,...,...
795,5,57,68,73,77
796,2,70,70,70,56
797,0,70,84,81,91
798,0,69,60,54,51


In [14]:
df['CSE_dis_math_score'].value_counts

<bound method IndexOpsMixin.value_counts of 0      67
1      76
2      87
3      82
4      73
       ..
795    57
796    70
797    70
798    69
799    52
Name: CSE_dis_math_score, Length: 1000, dtype: int64>

In [15]:
df['CSE_dis_math_score'].unique()

array([ 67,  76,  87,  82,  73,  75,  64,  41,  90,  59,  51,  45,  54,
        72,  94,  61,  60,  77,  85,  78,  49,  71,  48,  62,  56,  65,
        69,  68,  74,  58,  66,  39,  23,  40,  91,  80,  97,  52,  53,
        81,  70,  43,  57,  63,  55,  93,  44,  86,  50,  88,  32,  36,
        34,  79,  47,  46, 100,  35,  98,  84,  92,  89,   8,  38,  18,
        33,   0,  30,  42,  27,  99,  83,  22,  96,  37,  28,  24,  26,
        95,  29,  19], dtype=int64)

In [16]:
df['CSE_dis_math_score'].describe()

count    1000.00000
mean       66.08900
std        15.16308
min         0.00000
25%        57.00000
50%        66.00000
75%        77.00000
max       100.00000
Name: CSE_dis_math_score, dtype: float64

In [17]:
df['CSE_dis_math_score'] = df['CSE_dis_math_score'].replace(0,df['CSE_dis_math_score'].mean())

In [18]:
df['CSE_dis_math_score'].describe()

count    1000.000000
mean       66.155089
std        15.018072
min         8.000000
25%        57.000000
50%        66.000000
75%        77.000000
max       100.000000
Name: CSE_dis_math_score, dtype: float64

In [19]:
df['CSE_dis_math_score'].unique()

array([ 67.   ,  76.   ,  87.   ,  82.   ,  73.   ,  75.   ,  64.   ,
        41.   ,  90.   ,  59.   ,  51.   ,  45.   ,  54.   ,  72.   ,
        94.   ,  61.   ,  60.   ,  77.   ,  85.   ,  78.   ,  49.   ,
        71.   ,  48.   ,  62.   ,  56.   ,  65.   ,  69.   ,  68.   ,
        74.   ,  58.   ,  66.   ,  39.   ,  23.   ,  40.   ,  91.   ,
        80.   ,  97.   ,  52.   ,  53.   ,  81.   ,  70.   ,  43.   ,
        57.   ,  63.   ,  55.   ,  93.   ,  44.   ,  86.   ,  50.   ,
        88.   ,  32.   ,  36.   ,  34.   ,  79.   ,  47.   ,  46.   ,
       100.   ,  35.   ,  98.   ,  84.   ,  92.   ,  89.   ,   8.   ,
        38.   ,  18.   ,  33.   ,  66.089,  30.   ,  42.   ,  27.   ,
        99.   ,  83.   ,  22.   ,  96.   ,  37.   ,  28.   ,  24.   ,
        26.   ,  95.   ,  29.   ,  19.   ])

In [20]:
df['EEE_elctrnc_circuits_score'].unique()

array([ 73,  80,  94,  85,  76,  81,  74,  45,  75,  54,  31,  47,  64,
        84,  86,  59,  70,  72,  91,  90,  52,  87,  58,  67,  68,  69,
        60,  82,  50,  57,  77,  44,  65,  97,  79,  49,  62,  42,  71,
        93,  53,  51,  61,  46, 100,  92,  34,  29,  78,  48,  96,  66,
        56,  43,  63,  41,  95,  24,  99,  55,  83,  89,  32,  17,  39,
        37,  26,  88,  28,  23,  38,  40], dtype=int64)

In [21]:
df['CSE_softwr_devlplmnt_score'].unique()

array([ 68,  73,  95,  87,  78,  74,  75,  40,  69,  51,  36,  49,  67,
        76,  83,  64,  88,  92,  93,  82,  52,  58,  70,  81,  53,  57,
        89,  45,  79,  46,  84,  85,  50,  63,  61,  55,  96,  65,  72,
        38,  80,  91,  41,  47,  66,  54,  71,  44,  59,  48,  27, 100,
        77,  60,  62,  90,  94,  23,  86,  39,  43,  28,  42,  10,  34,
        37,  56,  22,  98,  33,  19,  35,  32,  97,  99,  15,  30],
      dtype=int64)

In [22]:
df['Mathematics_ivs_score'].unique()

array([ 58,  74,  87,  97,  68,  69,  86,  26,  81,  52,  24,  51,  62,
        95,  63,  59,  75,  78,  92,  40,  54,  65,  93,  56,  60,  96,
        55,  90,  47,  89,  85,  35,  23,  64,  61,  72,  80,  71,  98,
        70,  37,  25, 100,  49,  77,  76,  83,  50,  53,  41,  36,  79,
        33,  44,  66,  57,  45,  73,  46,  48,  38,  88,  84,  99,  39,
        29,  94,  42,  14,  43,  67,   1,  21,  20,  91,  82,  28,  34,
        31,  18,  30], dtype=int64)

In [23]:
df['Mathematics_ivs_score'].describe()

count    1000.00000
mean       67.04400
std        16.70704
min         1.00000
25%        56.00000
50%        68.00000
75%        80.00000
max       100.00000
Name: Mathematics_ivs_score, dtype: float64

In [24]:
df['Mathematics_ivs_score'] = df['Mathematics_ivs_score'].replace(1,df['Mathematics_ivs_score'].mean())

In [25]:
df['Mathematics_ivs_score'].describe()

count    1000.000000
mean       67.110044
std        16.575725
min        14.000000
25%        56.000000
50%        68.000000
75%        80.000000
max       100.000000
Name: Mathematics_ivs_score, dtype: float64

In [26]:
df['Mathematics_ivs_score'].unique()

array([ 58.   ,  74.   ,  87.   ,  97.   ,  68.   ,  69.   ,  86.   ,
        26.   ,  81.   ,  52.   ,  24.   ,  51.   ,  62.   ,  95.   ,
        63.   ,  59.   ,  75.   ,  78.   ,  92.   ,  40.   ,  54.   ,
        65.   ,  93.   ,  56.   ,  60.   ,  96.   ,  55.   ,  90.   ,
        47.   ,  89.   ,  85.   ,  35.   ,  23.   ,  64.   ,  61.   ,
        72.   ,  80.   ,  71.   ,  98.   ,  70.   ,  37.   ,  25.   ,
       100.   ,  49.   ,  77.   ,  76.   ,  83.   ,  50.   ,  53.   ,
        41.   ,  36.   ,  79.   ,  33.   ,  44.   ,  66.   ,  57.   ,
        45.   ,  73.   ,  46.   ,  48.   ,  38.   ,  88.   ,  84.   ,
        99.   ,  39.   ,  29.   ,  94.   ,  42.   ,  14.   ,  43.   ,
        67.   ,  67.044,  21.   ,  20.   ,  91.   ,  82.   ,  28.   ,
        34.   ,  31.   ,  18.   ,  30.   ])

In [27]:
X = df.drop(columns = ['target'])
y = df['target']

In [28]:
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

In [29]:
X_scaled

array([[ 0.05628777,  0.26252511, -0.00355543, -0.54987656],
       [ 0.65586561,  0.74221076,  0.3256506 ,  0.41587344],
       [ 1.38868297,  1.70158207,  1.77415712,  1.20054531],
       ...,
       [ 0.25614705,  1.01631685,  0.85238025,  1.44198281],
       [ 0.18952729, -0.62831968, -0.9253323 , -0.97239218],
       [-0.94300863, -0.97095229, -0.72780869, -0.12736094]])

In [30]:
x_train,x_test,y_train,y_test = train_test_split(X_scaled,y, test_size= 0.25, random_state = 355)

In [31]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

DecisionTreeClassifier()

In [32]:
dtc.score(x_train,y_train)

0.9986666666666667

In [33]:
dtc.score(x_test,y_test)

0.284

In [34]:
dtc.score(x_test,y_test)

0.284

In [35]:
# we are tuning three hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}

In [36]:
import graphviz
from sklearn.tree import export_graphviz

In [37]:
grid_search = GridSearchCV(estimator=dtc,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [38]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']})

In [39]:
best_parameters = grid_search.best_params_
print(best_parameters)

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 4, 'splitter': 'random'}


In [40]:
grid_search.best_score_

0.368

In [41]:
dtc = DecisionTreeClassifier(criterion = 'entropy', max_depth =4, min_samples_leaf= 6, min_samples_split= 4, splitter ='random')
dtc.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=6,
                       min_samples_split=4, splitter='random')

In [42]:
dtc.score(x_test,y_test)

0.376

In [43]:
log_reg = LogisticRegression()

log_reg.fit(x_train,y_train)

LogisticRegression()

In [44]:
y_pred = log_reg.predict(x_test)

In [45]:
y_pred

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5,
       5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 4, 4, 5, 4, 5, 4, 5,
       5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 1, 5, 5, 4, 5, 5, 5,
       5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5,
       5, 4, 4, 5, 5, 5, 5, 4, 4, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 5, 5, 5,
       5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 4, 5, 5, 5, 5, 5, 5, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 4, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 5, 4, 5, 5,
       5, 4, 4, 4, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 4, 5, 5, 4,
       5, 5, 5, 5, 5, 5, 4, 1, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4,
       5, 5, 5, 5, 5, 5, 5, 5], dtype=int64)

In [46]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.372

In [47]:
# Confusion Matrix
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[ 0,  0,  0,  0, 10, 26],
       [ 0,  1,  0,  0,  6, 23],
       [ 0,  1,  0,  0, 10, 17],
       [ 0,  1,  0,  0,  8, 22],
       [ 0,  0,  0,  0, 10, 25],
       [ 0,  0,  0,  0,  8, 82]], dtype=int64)

In [48]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [49]:
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
x = dataset.data
y = dataset.target

In [50]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
x, y, random_state=3
)

In [51]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train,y_train)
knn.score(x_test,y_test)

0.9370629370629371

In [52]:
bag_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=5),
                           n_estimators=10, max_samples=0.5,
                           bootstrap=True, random_state=3, oob_score=True)

In [53]:
bag_knn.oob_score

True

In [54]:
bag_knn.fit(x_train,y_train)
bag_knn.score(x_test,y_test)

0.9370629370629371

In [55]:
pasting_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=5),
                           n_estimators=10, max_samples=0.5, 
                                bootstrap=False, random_state=3, oob_score=False)

In [56]:
pasting_knn.fit(x_train,y_train)
pasting_knn.score(x_test,y_test)

0.9300699300699301

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [58]:
x_train,y_train = make_classification(n_samples=1000, n_features=30,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)

In [59]:
clf = RandomForestClassifier(max_depth=3, random_state=0)

In [60]:
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.6293706293706294

In [64]:
import pickle
with open('StudentGradingmodelForPrediction.sav', 'wb') as f:
    pickle.dump(dtc,f)
    
with open('sandardScalar.sav', 'wb') as f:
    pickle.dump(scalar,f)