In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the test-score data set and preview its first rows.
scores_csv = '../input/predict-test-scores-of-students/test_scores.csv'
df = pd.read_csv(scores_csv)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary of the loaded frame: column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

## Feature and Target

In [None]:
# Features are every column except the target; the target is the posttest score.
X = df.drop(columns=['posttest'])
y = df['posttest']

In [None]:
# Preview the target column.
y.head()

In [None]:
# Preview the feature frame.
X.head()

In [None]:
# Non-null value count of every feature column, grouped by school.
X.groupby('school').count()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Column dtypes and non-null counts of the feature frame.
X.info()

In [None]:
# Names of the feature columns whose dtype is `object`.
object_columns = [name for name, dtype in X.dtypes.items() if dtype == 'object']

In [None]:
# Show which columns were detected as object-typed.
object_columns

In [None]:
# For each object-typed column, print its name and distinct values,
# followed by a separator line.
separator = '*'*60
for column_name in object_columns:
    print(column_name)
    print(X[column_name].unique())
    print(separator)

In [None]:
# Remove the classroom identifier from the features.
# errors='ignore' keeps this cell re-runnable: because X is reassigned from
# itself, a second execution would otherwise raise KeyError ('classroom' is
# already gone).
X = X.drop(columns='classroom', errors='ignore')

In [None]:
# Remove the per-student identifier from the features.
# errors='ignore' keeps this cell re-runnable: because X is reassigned from
# itself, a second execution would otherwise raise KeyError ('student_id' is
# already gone).
X = X.drop(columns='student_id', errors='ignore')
X.head()

In [None]:
# Object-typed columns that remain in X at this point; these get label-encoded below.
object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

## LabelEncoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the un-encoded feature frame X stays available.
label_X = X.copy()

# Keep one fitted encoder per column. Refitting a single LabelEncoder for
# every column overwrites its classes_ each time, so only the last column's
# mapping would survive and the other columns' integer codes could not be
# decoded (or re-applied to new data) later.
label_encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    label_X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

In [None]:
# Preview the label-encoded features.
label_X.head()

In [None]:
# Copy of the full data set (target included) without the identifier columns.
columns_to_remove = ['student_id', 'classroom']
df2 = df.drop(columns=columns_to_remove)
df2.head()

In [None]:
# Replace each object-typed column of df2 with its integer label codes,
# fitting the encoder on the matching column of X.
label_encoder = LabelEncoder()
for name in object_cols:
    codes = label_encoder.fit_transform(X[name])
    df2[name] = codes

In [None]:
# Preview df2 after label encoding.
df2.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df2.
correlations = df2.corr()
column_labels = df2.columns
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    correlations,
    annot=True,
    annot_kws={'size': 9},
    xticklabels=column_labels,
    yticklabels=column_labels,
    ax=ax,
)

In [None]:
# Preview the original (un-encoded) data again before plotting from it.
df.head()

In [None]:
# Bar chart of the number of rows per school setting.
setting_counts = df['school_setting'].value_counts()
plt.figure(figsize=(6, 6))
setting_counts.plot.bar()
plt.title('School Setting')


In [None]:
# Number of rows per school type.
school_type_counts = df['school_type'].value_counts()
print(school_type_counts)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots  # still used by later cells

# Donut chart of the rows per school type.
# value_counts() is computed once and reused for both labels and values.
school_type_counts = df['school_type'].value_counts()
labels = school_type_counts.keys()
values = school_type_counts

# A single pie needs no subplot grid: the earlier 1x2 grid was never
# addressed (add_trace was called without row/col), so build the figure
# directly from the trace.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, name="School Type")])
fig.update_traces(hole=.5, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="School Type")
fig.show()

In [None]:
# Donut chart of the rows per teaching method.
# value_counts() is computed once and reused for both labels and values.
teaching_method_counts = df['teaching_method'].value_counts()

# A single pie needs no subplot grid: the earlier 1x2 grid was never
# addressed (add_trace was called without row/col), so build the figure
# directly from the trace.
fig = go.Figure(data=[go.Pie(labels=teaching_method_counts.keys(), values=teaching_method_counts, name="Teaching Methods")])
fig.update_traces(hole=.5, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Teaching Methods")
fig.show()

In [None]:
# Donut chart of the rows per school setting.
# value_counts() is computed once and reused for both labels and values.
school_setting_counts = df['school_setting'].value_counts()

# A single pie needs no subplot grid: the earlier 1x2 grid was never
# addressed (add_trace was called without row/col), so build the figure
# directly from the trace.
fig = go.Figure(data=[go.Pie(labels=school_setting_counts.keys(), values=school_setting_counts, name="School Setting")])
fig.update_traces(hole=.5, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="School Setting")
fig.show()

In [None]:
# Donut chart of the rows per lunch category.
# value_counts() is computed once and reused for both labels and values.
lunch_counts = df['lunch'].value_counts()

# A single pie needs no subplot grid: the earlier 1x2 grid was never
# addressed (add_trace was called without row/col), so build the figure
# directly from the trace.
fig = go.Figure(data=[go.Pie(labels=lunch_counts.keys(), values=lunch_counts, name="Lunch")])
fig.update_traces(hole=.5, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Lunch")
fig.show()

In [None]:
# student frequency
import plotly.express as px
fig = px.histogram(df['n_student'])
fig.update_layout(bargap=0.2)
fig.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    label_X,
    y,
    test_size=0.25,
    random_state=0,
)

# DecisionTreeRegressor

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Baseline: a decision tree with default (unconstrained) size.
# random_state is fixed so the fitted tree, and therefore every metric
# computed from it, is reproducible across runs; this matches the
# random_state=0 used for the other models in this notebook.
model1 = DecisionTreeRegressor(random_state=0)
model1.fit(train_x, train_y)


In [None]:
# Predictions of the baseline tree on the held-out rows.
pred_y1 = model1.predict(test_x)

In [None]:
# Mean absolute error of the baseline tree on the held-out rows.
mean_absolute_error(y_true=test_y, y_pred=pred_y1)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# R^2 of the baseline tree on the held-out rows.
r2_score(y_true=test_y, y_pred=pred_y1)

In [None]:
# Mean squared error of the baseline tree on the held-out rows.
mean_squared_error(y_true=test_y, y_pred=pred_y1)

In [None]:
def getting_error_metric(max_leaf_nodes, train_x, test_x, train_y, test_y):
    """Fit a size-limited decision tree and score it on the evaluation split.

    Args:
        max_leaf_nodes: Value passed to DecisionTreeRegressor(max_leaf_nodes=...).
        train_x: Features used for fitting.
        test_x: Features used for prediction.
        train_y: Targets used for fitting.
        test_y: Targets the predictions are scored against.

    Returns:
        Tuple ``(mae, mse, r_square)``: mean absolute error, mean squared
        error and R^2 of the predictions on ``test_x``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    predictions = tree.fit(train_x, train_y).predict(test_x)
    return (
        mean_absolute_error(test_y, predictions),
        mean_squared_error(test_y, predictions),
        r2_score(test_y, predictions),
    )

# Try several tree-size limits and print the metrics for each.
for leaf_limit in (5, 50, 500, 5000):
    print('For max_leaf_nodes = ', leaf_limit)
    tree_scores = getting_error_metric(leaf_limit, train_x, test_x, train_y, test_y)
    print('MAE, MSE, R_Square', tree_scores)
    

Of the values tested, `max_leaf_nodes = 50` gives the best scores on the test split.

In [None]:
# Preview the encoded features used for modelling.
label_X.head()

In [None]:
# Scatter of each row's pretest score against its posttest score.
# Axis labels and a title are set so the figure can be read on its own;
# the trailing ';' suppresses the text repr of the last call.
plt.scatter(X['pretest'], y)
plt.xlabel('pretest')
plt.ylabel('posttest')
plt.title('Posttest vs. pretest score');

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the label-encoded training features.
model3 = LinearRegression()

model3.fit(X=train_x, y=train_y)

In [None]:
# Predict on the held-out rows and report the mean absolute error.
pred_y3 = model3.predict(X=test_x)
mean_absolute_error(y_true=test_y, y_pred=pred_y3)


In [None]:
# R^2 of the linear model on the held-out rows.
r2_score(y_true=test_y, y_pred=pred_y3)

Linear regression scored better on the test split than the decision tree.

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings (apart from the fixed seed).
model4 = RandomForestRegressor(random_state=0)
model4.fit(train_x, train_y)
pred_y4 = model4.predict(test_x)

# Score the forest on the held-out rows and print all three metrics on one line.
forest_mae = mean_absolute_error(test_y, pred_y4)
forest_mse = mean_squared_error(test_y, pred_y4)
forest_r2 = r2_score(test_y, pred_y4)
print('Mean Absolute Error = {}, Mean Squared Error = {}, R^2 = {}'.format(forest_mae, forest_mse, forest_r2))

Tuning the `n_estimators` parameter

In [None]:
def get_metrics_rf(n_estimator, train_x, test_x, train_y, test_y):
    """Fit a random forest with the given number of trees and score it.

    Args:
        n_estimator: Value passed to RandomForestRegressor(n_estimators=...).
        train_x: Features used for fitting.
        test_x: Features used for prediction.
        train_y: Targets used for fitting.
        test_y: Targets the predictions are scored against.

    Returns:
        Tuple ``(mae, mse, r2)``: mean absolute error, mean squared error
        and R^2 of the predictions on ``test_x``.
    """
    forest = RandomForestRegressor(random_state=0, n_estimators=n_estimator)
    forest.fit(train_x, train_y)
    predicted = forest.predict(test_x)
    mae = mean_absolute_error(test_y, predicted)
    mse = mean_squared_error(test_y, predicted)
    r2 = r2_score(test_y, predicted)
    return mae, mse, r2

# Ten evenly spaced forest sizes from 100 to 500, truncated to whole numbers.
estimators = list(map(int, np.linspace(start=100, stop=500, num=10)))
for tree_count in estimators:
    print('n_estimators value = ',tree_count)
    forest_scores = get_metrics_rf(tree_count, train_x, test_x, train_y, test_y)
    print('The MAE, MSE, R^2 values are ', forest_scores)

In [None]:
# Final model: a 500-tree random forest, scored by R^2 on the held-out rows.
random_forest = RandomForestRegressor(random_state=0, n_estimators=500)
random_forest.fit(train_x, train_y)
predictions_y = random_forest.predict(test_x)
final_r2 = r2_score(test_y, predictions_y)
print(final_r2)

In [None]:
import joblib

# Persist the fitted forest to a file in the current working directory.
# NOTE(review): only the model is saved here; the label encoders used to
# build its input features are not written alongside it.
joblib.dump(value=random_forest, filename='predict_test_scores')

In [None]:
# Shell escape: list the current directory (presumably to check that the model file was written; confirm).
!ls