In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
df = pd.read_csv("/Users/sanketsaxena/Desktop/studentPerformance/src/NOTEBOOKS/data/stud.csv")
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
X = df.drop("math_score",axis=1)
Y = df['math_score']

In [5]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
Y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [45]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [46]:
num_features = [features for features in X_train.columns if(X_train[features].dtype != 'O')]
cat_features = [features for features in X_train.columns if(X_train[features].dtype == 'O')]

In [47]:
num_features

['reading_score', 'writing_score']

In [48]:
cat_features

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [49]:
X_train.nunique()

gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
reading_score                  69
writing_score                  71
dtype: int64

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [51]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()


In [52]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
    )


In [53]:
X_train.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57


In [54]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [55]:
X_test.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
521,female,group C,associate's degree,standard,none,86,84
737,female,group B,some college,free/reduced,completed,66,73
740,male,group D,bachelor's degree,standard,none,73,72
660,male,group C,some college,free/reduced,none,77,73
411,male,group E,some college,standard,completed,83,78


In [56]:
X_test_transformed = preprocessor.transform(X_test)

In [58]:
X_train_transformed = pd.DataFrame(X_train_transformed)

In [60]:
X_test_transformed = pd.DataFrame(X_test_transformed)

In [61]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       800 non-null    float64
 1   1       800 non-null    float64
 2   2       800 non-null    float64
 3   3       800 non-null    float64
 4   4       800 non-null    float64
 5   5       800 non-null    float64
 6   6       800 non-null    float64
 7   7       800 non-null    float64
 8   8       800 non-null    float64
 9   9       800 non-null    float64
 10  10      800 non-null    float64
 11  11      800 non-null    float64
 12  12      800 non-null    float64
 13  13      800 non-null    float64
 14  14      800 non-null    float64
 15  15      800 non-null    float64
 16  16      800 non-null    float64
 17  17      800 non-null    float64
 18  18      800 non-null    float64
dtypes: float64(19)
memory usage: 118.9 KB


In [64]:
X_test_transformed.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
dtype: int64

In [65]:
import pickle

In [72]:
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [75]:
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Support Vector Machine': (SVR(), {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [50, 100, 150]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 10, 20]})
}

In [76]:
from sklearn.model_selection import GridSearchCV

In [88]:
bestModel  = None
r2prev = -1
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_transformed, Y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test_transformed)
    
    # Calculate R^2 score
    r2 = r2_score(Y_test, y_pred)
    if(r2 > r2prev):
        bestModel = best_model
        r2prev = r2
    # Print results
    

LinearRegression()
-1 ------- 0.8804332983749565
SVR(C=10, kernel='linear')
0.8804332983749565 ------- 0.8804353935218923


In [93]:
model = pickle.load(open("/Users/sanketsaxena/Desktop/studentPerformance/Artifacts/model.pkl",'rb'))

In [94]:
y = model.predict(X_test_transformed)



In [104]:
preprocessor = pickle.load(open("/Users/sanketsaxena/Desktop/studentPerformance/Artifacts/preprocessor.pkl",'rb'))

In [118]:
X = pd.DataFrame(["female","group C","associate's degree","standard","none",86,84]).T

In [121]:
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group C,associate's degree,standard,none,86,84


In [132]:
X_test.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'reading_score', 'writing_score'],
      dtype='object')

In [122]:
X = preprocessor.transform(X)

In [123]:
print(model.predict(X))

[76.6411448]


