In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot
import missingno as msno
import category_encoders as ce
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import pickle
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the student test-score dataset and preview the first rows.
csv_path = '../input/predict-test-scores-of-students/test_scores.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Summary statistics for the dataset's columns.
data.describe()

In [None]:
# Column names, non-null counts and dtypes.
data.info()

# Exploratory Data Analysis

In [None]:
# Count missing values per column.
missing_per_column = data.isnull().sum()
missing_per_column

Analysis:

There are no missing values in the dataset.

# checking the unique values in the dataset

In [None]:
# Distinct classroom identifiers, followed by the shape of that array.
unique_classrooms = data.classroom.unique()
print(unique_classrooms)
print(unique_classrooms.shape)

In [None]:
# Distinct school identifiers, followed by the shape of that array.
unique_schools = data.school.unique()
print(unique_schools)
print(unique_schools.shape)

In [None]:
# Distinct values of school_type.
data.school_type.unique()

In [None]:
# Distinct values of school_setting.
data.school_setting.unique()

In [None]:
# Distinct values of teaching_method.
data.teaching_method.unique()

In [None]:
# Distinct values of lunch.
data.lunch.unique()

In [None]:
# dtype of every column; object columns are treated as categorical below.
data.dtypes

In [None]:
# Show how each categorical (object-typed) column is distributed:
# the number of categories followed by the per-category counts.
categorical_columns = data.select_dtypes(include='object').columns
for column in categorical_columns:
    counts = data[column].value_counts()
    print("-----Categorical_feature----", column)
    print('\n')
    print("number of categories in "+" "+ column, len(counts))
    print('\n')
    print(counts)

In [None]:
# student_id is only a row identifier, so it is removed before analysis.
data = data.drop(columns='student_id')
data.head()

In [None]:
# Histogram of rows per school_setting, one colour per setting.
fig = px.histogram(
    data,
    x='school_setting',
    color='school_setting',
    title="Count of Schools by School_setting",
)
fig.show()

Analysis:

The graph above shows that urban is the most common school setting in the dataset.

In [None]:
# Mean post-test score and mean class size (n_student) per school_setting.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names (['posttest','n_student']) is no longer supported by pandas.
school_setting_data = (
    data.groupby('school_setting')[['posttest', 'n_student']]
    .mean()
    .reset_index()
)
school_setting_data

In [None]:
# Mean post-test score per school setting; bar colour encodes mean n_student.
fig = px.bar(
    school_setting_data,
    x='school_setting',
    y='posttest',
    color='n_student',
    title="School's Zone impact on test score",
)
fig.show()

#  Analysis

* Students from suburban schools perform better than students from urban and rural schools.
* Urban schools have more students per class than rural and suburban schools, which may contribute to the lower scores of urban students.

In [None]:
# Histogram of rows per gender, one colour per gender.
fig = px.histogram(
    data,
    x='gender',
    color='gender',
    title="Count of Students by Gender",
)
fig.show()

Analysis

* Both genders are equally present

In [None]:
# Mean post-test score for each teaching method, as a two-column frame.
mean_posttest_by_method = data.groupby(['teaching_method'])['posttest'].mean()
teaching_method = pd.DataFrame(mean_posttest_by_method.reset_index())
teaching_method

In [None]:

# Histogram of rows per teaching_method, one colour per method.
fig = px.histogram(
    data,
    x='teaching_method',
    color='teaching_method',
    title="Count of schools by teaching method",
)
fig.show()

Analysis

* The experimental teaching method is less common than the standard method.

In [None]:
# Mean post-test score per teaching method; bar colour follows the score.
fig = px.bar(
    teaching_method,
    x='teaching_method',
    y='posttest',
    color='posttest',
    title="Teaching method's impact on Test Score",
)
fig.show()

Analysis

* The experimental strategy is used less often, but the average score of its students is higher than that of students taught with the standard method.
* Teaching method is an important feature for predicting a good test score.

In [None]:
# Per (school, school_type, teaching_method) group: the number of rows
# (n_students) and the mean post-test score.
school_groups = data.groupby(['school', 'school_type', 'teaching_method'])
school_summary = school_groups.agg(
    n_students=('n_student', 'size'),
    posttest=('posttest', 'mean'),
)
school_data = pd.DataFrame(school_summary.reset_index())

In [None]:
# Display the per-school summary built in the previous cell.
school_data

In [None]:
# Students per school, split by teaching method. The mean post-test score is
# shown as the bar label and, with school_type, in the hover tooltip.
# The title now describes what is plotted (y = n_students); it had been
# copied from the earlier "teaching method vs. score" chart.
fig = px.bar(
    school_data,
    x='school',
    y='n_students',
    color='teaching_method',
    hover_data=['school_type', 'posttest'],
    text='posttest',
    title="Number of students per school by teaching method",
)
fig.show()

Analysis

* Only one school does not use the experimental teaching method at all, and only one school uses it exclusively.
* Almost all schools use the experimental teaching method to some extent, and its results are significantly better.




In [None]:
# plotting correlation heatmap
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps data.corr() from failing on the categorical (object) columns under
# recent pandas versions. The previously built `filtere_data` frame was never
# used and referenced a column ("n_students") that does not exist in `data`,
# so it has been removed.
numeric_data = data.select_dtypes(include='number')

# setting the dimensions of the plot
fig, ax = plt.subplots(figsize=(15, 5))

# drawing the plot
dataplot = sns.heatmap(numeric_data.corr(), cmap="YlGnBu", annot=True, ax=ax)
plt.show()

# Feature Engineering

# Data Preparation

In [None]:
# Columns still present after dropping student_id.
data.columns

In [None]:
# gender is removed from the feature set because it was judged to have no
# effect on the posttest score.
data = data.drop(columns='gender')

In [None]:
# Preview the frame after dropping gender.
data.head()

In [None]:
# Categorical features split by cardinality: the low-cardinality ones are
# fully one-hot encoded, the high-cardinality ones get top-10 encoding.
low_cardinality_features = [
    'school_setting',
    'school_type',
    'teaching_method',
    'lunch',
]
high_cardinality_features = [
    'school',
    'classroom',
]

In [None]:
# Display the list of low-cardinality feature names.
low_cardinality_features

In [None]:
# Sub-frame holding only the low-cardinality categorical columns.
# Columns are selected with the ordered list defined above rather than a set
# literal: a set has no defined order (so the feature order could change
# between runs) and recent pandas versions reject a set for `columns`.
low_cardinality_features_data = data[low_cardinality_features].copy()

In [None]:
# Sub-frame holding only the high-cardinality categorical columns.
# Selected with the ordered list (not a set literal) so the column order is
# deterministic and the code works on pandas versions that reject sets.
high_cardinality_features_data = data[high_cardinality_features].copy()

In [None]:
# One-hot encode the low-cardinality features; drop_first=True omits the
# first level of each feature.
low_cardinality_features_encoded_data = pd.get_dummies(
    low_cardinality_features_data,
    drop_first=True,
)

In [None]:
# Preview the one-hot encoded low-cardinality features.
low_cardinality_features_encoded_data.head()

# One-hot encoding the 10 most frequent categories (all remaining categories are encoded as 0)

In [None]:
# Working copies for the top-10 encoding of the high-cardinality features.
one_hot_encoding_on_most_frequent_categories = data.copy()
# Keep only school and classroom. A list (not a set literal) fixes the column
# order, which the later feature selection relies on, and an explicit copy
# lets indicator columns be added without touching the source frame.
one_hot_encoding_on_most_frequent_categories1 = (
    one_hot_encoding_on_most_frequent_categories[["school", "classroom"]].copy()
)

In [None]:
# Report how many distinct labels each high-cardinality column has.
for column in one_hot_encoding_on_most_frequent_categories1.columns:
    n_labels = len(one_hot_encoding_on_most_frequent_categories[column].unique())
    print(column, ":", n_labels, "labels")

In [None]:
# Row counts of the 10 most frequent schools.
one_hot_encoding_on_most_frequent_categories1.school.value_counts().sort_values(ascending=False).head(10)

In [None]:
# Row counts of the 10 most frequent classrooms.
one_hot_encoding_on_most_frequent_categories1.classroom.value_counts().sort_values(ascending=False).head(10)

In [None]:

# Labels of the 10 most frequent schools, as a plain Python list.
school_counts = one_hot_encoding_on_most_frequent_categories1.school.value_counts()
school_counts = school_counts.sort_values(ascending=False)
school_10 = list(school_counts.head(10).index)
school_10

In [None]:
# Labels of the 10 most frequent classrooms, as a plain Python list.
classroom_counts = one_hot_encoding_on_most_frequent_categories1.classroom.value_counts()
classroom_counts = classroom_counts.sort_values(ascending=False)
classroom_10 = list(classroom_counts.head(10).index)
classroom_10

In [None]:

# Add one 0/1 indicator column per top-10 school; rows belonging to any
# other school get 0 in all of these columns.
for school_label in school_10:
    is_this_school = one_hot_encoding_on_most_frequent_categories1['school'] == school_label
    one_hot_encoding_on_most_frequent_categories1[school_label] = np.where(is_this_school, 1, 0)

In [None]:
# Preview the frame after adding the school indicator columns.
one_hot_encoding_on_most_frequent_categories1.head()

In [None]:
# Add one 0/1 indicator column per top-10 classroom; rows belonging to any
# other classroom get 0 in all of these columns.
for classroom_label in classroom_10:
    is_this_classroom = one_hot_encoding_on_most_frequent_categories1['classroom'] == classroom_label
    one_hot_encoding_on_most_frequent_categories1[classroom_label] = np.where(is_this_classroom, 1, 0)

In [None]:
# Preview the frame after adding the classroom indicator columns.
one_hot_encoding_on_most_frequent_categories1.head()


In [None]:
# Keep only the indicator columns created for the top-10 schools and
# classrooms. They are selected by label instead of the positional slice
# iloc[:, 2:22], which silently depended on the raw columns sitting in
# positions 0-1 and on exactly 20 indicator columns having been added.
top_category_columns = school_10 + classroom_10
one_hot_encoding_on_most_frequent_categories1_filtered_features = (
    one_hot_encoding_on_most_frequent_categories1[top_category_columns]
)
one_hot_encoding_on_most_frequent_categories1_filtered_features.head()

In [None]:
# (rows, indicator columns) of the top-10 encoded features.
one_hot_encoding_on_most_frequent_categories1_filtered_features.shape

In [None]:

# Place the top-10 indicator columns and the one-hot encoded low-cardinality
# columns side by side (column-wise concat, aligned on the row index).
categorical_feature_frames = [
    one_hot_encoding_on_most_frequent_categories1_filtered_features,
    low_cardinality_features_encoded_data,
]
combined_features = pd.concat(categorical_feature_frames, axis=1)
combined_features.head()

In [None]:
# (rows, columns) of the combined categorical features.
combined_features.shape

In [None]:
# Numeric predictors. Selected with an ordered list instead of a set literal
# so the column order is the same on every run and the code works on pandas
# versions that reject a set for `columns`.
Numerical_features = data[["n_student", "pretest"]].copy()

In [None]:
# Full design matrix: numeric predictors followed by the encoded categoricals.
feature_frames = [Numerical_features, combined_features]
training_features = pd.concat(feature_frames, axis=1)
training_features.head()

In [None]:
# Target column as a single-column DataFrame. A list is used instead of a set
# literal, which recent pandas versions reject for `columns`.
target_feature = data[["posttest"]].copy()

In [None]:
# Independent copies of the design matrix (X) and the target (y) for modelling.
X, y = training_features.copy(), target_feature.copy()

# Model Training

# DecisionTreeRegressor

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Shapes of the train/test feature and target sets.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:

# Baseline decision tree with default hyperparameters. random_state is fixed
# so the reported metrics are reproducible across runs.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

In [None]:
# Decision-tree predictions for the held-out test set.
dt_predictions = tree.predict(X_test)

In [None]:


# Evaluate the decision tree on the test set (MAE, MSE, RMSE, R^2).
dt_mae = metrics.mean_absolute_error(y_test, dt_predictions)
dt_mse = metrics.mean_squared_error(y_test, dt_predictions)
dt_r2 = metrics.r2_score(y_test, dt_predictions)
print('Mean Absolute Error (MAE):', dt_mae)
print('Mean Squared Error (MSE):', dt_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(dt_mse))
print('R^2:', dt_r2)

# Random Forest Regressor

In [None]:
# Random forest with default hyperparameters. random_state is fixed so the
# metrics are reproducible, and the single-column target frame is flattened
# to 1-D, which is the shape the estimator expects (a column vector triggers
# a DataConversionWarning).
randomforest = RandomForestRegressor(random_state=42)
randomforest.fit(X_train, y_train.values.ravel())

In [None]:
# Random-forest predictions for the held-out test set.
randomforest_prediction=randomforest.predict(X_test)

In [None]:
# Evaluate the random forest on the test set (MAE, MSE, RMSE, R^2).
rf_mae = metrics.mean_absolute_error(y_test, randomforest_prediction)
rf_mse = metrics.mean_squared_error(y_test, randomforest_prediction)
rf_r2 = metrics.r2_score(y_test, randomforest_prediction)
print('Mean Absolute Error (MAE):', rf_mae)
print('Mean Squared Error (MSE):', rf_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(rf_mse))
print('R^2:', rf_r2)

# KNN

In [None]:
# k-nearest-neighbours regressor with default settings.
# NOTE(review): the features are not scaled before fitting; KNN is
# distance-based, so consider standardising the numeric columns — confirm.
regressor=KNeighborsRegressor()
regressor.fit(X_train,y_train)

In [None]:
# KNN predictions for the held-out test set.
knn_prediction=regressor.predict(X_test)

In [None]:
# Evaluate the KNN regressor on the test set (MAE, MSE, RMSE, R^2).
knn_mae = metrics.mean_absolute_error(y_test, knn_prediction)
knn_mse = metrics.mean_squared_error(y_test, knn_prediction)
knn_r2 = metrics.r2_score(y_test, knn_prediction)
print('Mean Absolute Error (MAE):', knn_mae)
print('Mean Squared Error (MSE):', knn_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(knn_mse))
print('R^2:', knn_r2)

# xgboost

In [None]:
# XGBoost regressor with default hyperparameters.
# Two defects are fixed here:
#   * the model is fitted on the training split only (it was fitted on the
#     full X/y, which leaks the test rows into training);
#   * predictions come from xgboost_model itself (they were taken from the
#     KNN `regressor`, so the reported "xgboost" metrics were KNN's).
xgboost_model = XGBRegressor()
xgboost_model.fit(X_train, y_train)
xgboost_prediction = xgboost_model.predict(X_test)


In [None]:
# Evaluate the xgboost predictions on the test set (MAE, MSE, RMSE, R^2).
xgb_mae = metrics.mean_absolute_error(y_test, xgboost_prediction)
xgb_mse = metrics.mean_squared_error(y_test, xgboost_prediction)
xgb_r2 = metrics.r2_score(y_test, xgboost_prediction)
print('Mean Absolute Error (MAE):', xgb_mae)
print('Mean Squared Error (MSE):', xgb_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(xgb_mse))
print('R^2:', xgb_r2)

# 

# Hyperparameter Tuning

In [None]:

# Search space sampled by RandomizedSearchCV for the random forest.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=3000, num=10)]
# Number of features to consider at every split.
# 1.0 (use all features) replaces 'auto': for regressors 'auto' meant "all
# features", and the string was deprecated in scikit-learn 1.1 and removed in
# 1.3, where it makes every sampled candidate that uses it fail to fit.
max_features = [1.0, 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
# Minimum number of samples required to split a node
min_samples_split = [3, 6, 11, 16]
# Minimum number of samples required at each leaf node
min_samples_leaf = [3, 5, 7, 9, 11]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
              }
print(random_grid)

In [None]:

# Randomized search over random_grid: 100 sampled candidates, each scored
# with 10-fold cross-validation, using all available cores.
# The base estimator is seeded so that, together with the search's own
# random_state, the tuning result is reproducible.
rf = RandomForestRegressor(random_state=42)
rf_randomcv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=100,
    n_jobs=-1,
)
### fit the randomized model
# The single-column target frame is flattened to 1-D to avoid a
# DataConversionWarning on every one of the cross-validation fits.
rf_randomcv.fit(X_train, y_train.values.ravel())

In [None]:
# Hyperparameter combination with the best cross-validated score.
rf_randomcv.best_params_

In [None]:
# Estimator refitted by the search using the best hyperparameters found.
best_random_grid=rf_randomcv.best_estimator_

In [None]:
# Tuned-model predictions for the held-out test set.
optimized_pred=best_random_grid.predict(X_test)

In [None]:
# Evaluate the tuned random forest on the test set (MAE, MSE, RMSE, R^2).
tuned_mae = metrics.mean_absolute_error(y_test, optimized_pred)
tuned_mse = metrics.mean_squared_error(y_test, optimized_pred)
tuned_r2 = metrics.r2_score(y_test, optimized_pred)
print('Mean Absolute Error (MAE):', tuned_mae)
print('Mean Squared Error (MSE):', tuned_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(tuned_mse))
print('R^2:', tuned_r2)

In [None]:

# Write the tuned model's test-set predictions to submission.csv
# (row index omitted).
my_submission = pd.DataFrame(optimized_pred)
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Build two frames for comparing predictions with the actual scores.
# Column names are passed as lists: a set literal is rejected for `columns`
# by recent pandas versions.
data_prediction = pd.DataFrame(optimized_pred, columns=["Predicted_score"])
# reset_index() renumbers the test rows 0..n-1 so they line up positionally
# with data_prediction (the old index is kept as a regular column).
y_test1 = y_test.copy()
y_test1 = y_test1.reset_index()
y_test2 = y_test1[["posttest"]]


In [None]:
# Predicted and actual post-test scores side by side; show the first 10 rows.
comparison_frames = [data_prediction, y_test2]
predicted_data_frame = pd.concat(comparison_frames, axis=1)
predicted_data_frame.head(10)

# Saving the model


In [None]:
# Serialise the fitted RandomizedSearchCV object to disk.
# The context manager guarantees the file handle is flushed and closed; the
# bare open(...) call it replaces left the handle open.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_randomcv, model_file)

In [None]:
# List the current working directory.
!ls