In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression




# Load the student test-score dataset from the relative ../input directory.
df = pd.read_csv("../input/predict-test-scores-of-students/test_scores.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics (pandas describe() defaults).
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Column overview: non-null counts and dtypes.
df.info()

In [None]:
# Distinct values of the school column.
df.school.unique()

In [None]:
# Distinct values of the n_student column.
df.n_student.unique()

In [None]:
# Distinct posttest scores.
df.posttest.unique()

In [None]:
# Distinct pretest scores.
df.pretest.unique()

In [None]:
# Pairwise correlations between the numeric columns only. The frame also
# holds string columns (school, gender, lunch, ...), which DataFrame.corr()
# refuses to process on pandas >= 2.0, so select the numeric columns first.
print(df.select_dtypes(include="number").corr())

When applied to an entire dataframe, the corr() function returns a dataframe of pairwise correlations between the columns. We can see that there is a weak negative correlation between n_student and the pretest/posttest scores. Also, notice that the values on the diagonal are 1s; this is because each column is perfectly correlated with itself.

- A positive correlation indicates that the values of two variables tend to increase together.
- A negative correlation indicates that the values of one variable tend to decrease as the other increases.

In [None]:
# Correlation heatmap of the numeric columns. String columns are dropped up
# front because DataFrame.corr() raises on them in pandas >= 2.0.
corr = df.select_dtypes(include="number").corr()

# `linewidths` is heatmap's documented name for the cell-separator width.
sns.heatmap(corr, annot=True, linewidths=.6, linecolor="blue")

plt.show()

In [None]:
# Frequency table for the `school` column.
# value_counts() is computed once per column rather than twice per distinct
# value; rows are still printed in order of first appearance in the data.
for x in ["school"]:
    counts = df[x].value_counts()
    shares = df[x].value_counts(normalize=True) * 100
    for val in df[x].unique():
        print(f"{val} - Count: {counts[val]}, Percentage: {shares[val]:.2f}%")
    print()

In [None]:
# Frequency table for each remaining categorical column.
# value_counts() is computed once per column rather than twice per distinct
# value; rows are still printed in order of first appearance in the data.
for x in ["school_setting", "school_type", "teaching_method", "gender", "lunch"]:
    counts = df[x].value_counts()
    shares = df[x].value_counts(normalize=True) * 100
    for val in df[x].unique():
        print(f"{val} - Count: {counts[val]}, Percentage: {shares[val]:.2f}%")
    print()

Understanding the probability distribution of the variables in the dataframe


When a distribution has lower variability, the values in a dataset are more consistent. However, when the variability is higher, the data points are more dissimilar and extreme values become more likely. Consequently, understanding variability helps you grasp the likelihood of unusual events.

In [None]:
# Distribution of class sizes. distplot() is deprecated in seaborn; a
# density-scaled histplot with a KDE overlay draws a comparable figure.
sns.histplot(df["n_student"], kde=True, stat="density");

From the above plot, the number of students per class (n_student) has a non-uniform distribution, and its mean and median are noticeably apart. The ‘n_student’ plot is right-skewed (long tail on the right), so its mean is greater than its median.



In [None]:
# Distribution of pretest scores. distplot() is deprecated in seaborn; a
# density-scaled histplot with a KDE overlay draws a comparable figure.
sns.histplot(df["pretest"], kde=True, stat="density");

The above plot shows the distribution of values in the ‘pretest’ feature. The distribution is fairly symmetric, with mean and median values close to each other.



In [None]:
# Distribution of posttest scores. distplot() is deprecated in seaborn; a
# density-scaled histplot with a KDE overlay draws a comparable figure.
sns.histplot(df["posttest"], kde=True, stat="density");

The above plot shows the distribution of values in the ‘posttest’ feature. The distribution is fairly symmetric, with mean and median values close to each other.

In [None]:
# Overlay the pretest and posttest distributions. Given the two-column
# frame, histplot draws one colour-coded histogram + KDE per column, whereas
# the deprecated distplot() does not distinguish the columns.
# common_norm=False scales each column's density independently.
sns.histplot(
    df[["pretest", "posttest"]],
    kde=True,
    stat="density",
    common_norm=False,
);

Both of our test scores are approximately normally distributed,
so our data can yield a good machine learning model.

In [None]:
# Regression of posttest score on class size.
sns.pairplot(
    df,
    x_vars=["n_student"],
    y_vars=["posttest"],
    kind="reg",
    height=8,
    aspect=1.5,
);

In [None]:
# Regression of pretest score on class size.
sns.pairplot(
    df,
    x_vars=["n_student"],
    y_vars=["pretest"],
    kind="reg",
    height=8,
    aspect=1.5,
);

We can clearly observe from the regressions above that the pretest and posttest scores increase as the class size decreases.


In [None]:
# KDE of pretest scores, one curve per class size.
sns.displot(
    data=df,
    x="pretest",
    hue="n_student",
    kind="kde",
    palette="Set1",
)

If you observe closely, classes with a uniform distribution score high, and classes with a non-uniform distribution score low, in the pretest.



In [None]:
# KDE of posttest scores, one curve per class size.
sns.displot(
    data=df,
    x="posttest",
    hue="n_student",
    kind="kde",
    palette="Set1",
)

If you observe closely, classes with a uniform distribution score high, and classes with a non-uniform distribution score low, in the posttest.



In [None]:
# Pretest vs. posttest regressions, one panel per school, coloured by class size.
sns.lmplot(
    data=df,
    x="pretest",
    y="posttest",
    hue="n_student",
    col="school",
    palette="Set1",
);

From the above regression, I observed that IDGFP has the highest test scores, while the GOOBU, KZKKE and VVTA schools have lower test scores.

In [None]:
# Pretest vs. posttest regressions, one panel per school type, coloured by class size.
sns.lmplot(
    data=df,
    x="pretest",
    y="posttest",
    hue="n_student",
    col="school_type",
    palette="Set1",
);

In [None]:
# Pretest vs. posttest regressions, one panel per school setting, coloured by class size.
sns.lmplot(
    data=df,
    x="pretest",
    y="posttest",
    hue="n_student",
    col="school_setting",
    palette="Set1",
);

As observed, the urban setting has the largest number of students who took both the pretest and the posttest, and urban students also have the highest scores in both tests.

In [None]:
# Pretest vs. posttest scatter, one panel per gender, coloured by class size.
sns.relplot(
    data=df,
    x="pretest",
    y="posttest",
    hue="n_student",
    col="gender",
    palette="Set1",
);

Both male and female students reach the highest test scores, and the classes with 17 and 26 students take first place among the top scores in both phases.

In [None]:
# Pretest vs. posttest scatter, one panel per teaching method, coloured by
# class size. `ci` is not passed: a scatter plot draws no confidence
# interval, and seaborn >= 0.12 rejects the argument for kind="scatter".
sns.relplot(
    data=df,
    x="pretest",
    y="posttest",
    hue="n_student",
    col="teaching_method",
    kind="scatter",
    palette="Set1",
);

The classes with 26 and 17 students are the top scorers under experimental teaching in the posttest;
the classes with 16 and 25 students are the top scorers under standard teaching in the pretest.


In [None]:
# Pretest score per class size, split by lunch eligibility.
sns.barplot(
    data=df,
    x="n_student",
    y="pretest",
    hue="lunch",
    palette="Set1",
);

In [None]:

# Posttest score per class size, split by lunch eligibility.
sns.barplot(
    data=df,
    x="n_student",
    y="posttest",
    hue="lunch",
    palette="Set2",
);
# Alternative view (disabled):
# sns.lmplot(x="pretest", y="posttest", hue="n_student", col="lunch", data=df, palette="Set1");

Most students in these classes do not qualify for reduced/free lunch, even though they have high test scores.

In [None]:
# Pretest score against class size as lines: colour = gender, line style =
# lunch eligibility, one panel per teaching method. ci=None disables the
# confidence band (newer seaborn spells this errorbar=None).
sns.relplot(
    data=df,
    x="n_student",
    y="pretest",
    hue="gender",
    style="lunch",
    col="teaching_method",
    kind="line",
    ci=None,
    palette="Set1",
);


In [None]:
# Posttest score against class size as lines: colour = gender, line style =
# lunch eligibility, one panel per teaching method. ci=None disables the
# confidence band (newer seaborn spells this errorbar=None).
sns.relplot(
    data=df,
    x="n_student",
    y="posttest",
    hue="gender",
    style="lunch",
    col="teaching_method",
    kind="line",
    ci=None,
    palette="Set2",
);


Comparing the two charts above, I observed that for students who qualify for lunch, the posttest score increased relative to the pretest score.

The large population of students who do not qualify for lunch has the highest pretest and posttest scores.

In [None]:
# Pairwise relationships across the selected feature and score columns.
sns.pairplot(
    df[
        [
            'school_setting',
            'school_type',
            'teaching_method',
            'n_student',
            'gender',
            'lunch',
            'pretest',
            'posttest',
        ]
    ]
)


In [None]:
# Same pairwise grid, drawn with kernel density estimates.
sns.pairplot(
    df[
        [
            'school_setting',
            'school_type',
            'teaching_method',
            'n_student',
            'gender',
            'lunch',
            'pretest',
            'posttest',
        ]
    ],
    kind="kde",
)


## Data Modeling

In [None]:
# Drop the classroom and student_id columns before modelling.
df2 = df.drop(columns=['classroom', 'student_id'])
df2.head()

In [None]:
# One-hot encode the categorical columns, then rename the dummy columns
# whose generated names contain spaces or hyphens.
features = pd.get_dummies(df2).rename(
    columns={
        'school_type_Non-public': 'school_type_Non_public',
        'lunch_Does not qualify': 'lunch_Does_not_qualify',
        'lunch_Qualifies for reduced/free lunch': 'lunch_Qualifies_for_reduced/free_lunch',
    }
)

features.head()

In [None]:
# Target is the posttest score; every other encoded column is a predictor.
y = features["posttest"]
X = features.drop(columns='posttest')

In [None]:
# Preview the first target values.
y.head()

In [None]:
# Split the data: 20% of the rows are held out for testing, with a fixed
# seed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Training features.
X_train

In [None]:
# Training targets.
y_train

In [None]:
# Test features.
X_test

In [None]:
# Test targets.
y_test

In [None]:
from sklearn.metrics import explained_variance_score, mean_absolute_error


def test_score(y_test, y_pred):
    """Print evaluation metrics for a set of predictions and return the score.

    Two lines are printed: the explained-variance score expressed as a
    percentage (labelled "accuracy") and the mean absolute error, both with
    two decimals.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.

    Returns:
        The explained-variance score multiplied by 100.
    """
    accuracy = 100 * explained_variance_score(y_test, y_pred)
    mae = round(mean_absolute_error(y_test, y_pred), 2)

    print(f"accuracy: {accuracy:.2f}")
    print(f"MAE: {mae:.2f}")

    return accuracy

In [None]:
# One score slot per model evaluated below (11 in total), initialised to zero.
accuracy_scores = np.zeros(shape=11, dtype=np.float64)

In [None]:
# Linear Regression -> score slot 0
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
accuracy_scores[0] = test_score(y_test, y_pred)

In [None]:
# Lasso Regression (cross-validated alpha) -> score slot 1
from sklearn.linear_model import LassoCV

reg1 = LassoCV()
reg1.fit(X_train, y_train)
y_pred1 = reg1.predict(X_test)
accuracy_scores[1] = test_score(y_test, y_pred1)

In [None]:
# Decision Tree Regression -> score slot 2
from sklearn.tree import DecisionTreeRegressor

# random_state pins the tree's internal randomness so the reported score is
# the same on every Restart & Run All.
reg2 = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred2 = reg2.predict(X_test)
accuracy_scores[2] = test_score(y_test, y_pred2)

In [None]:
# Support Vector Regressor -> score slot 3
from sklearn.svm import SVR

reg3 = SVR()
reg3.fit(X_train, y_train)
y_pred3 = reg3.predict(X_test)
accuracy_scores[3] = test_score(y_test, y_pred3)

In [None]:
# Random Forest Regressor -> score slot 4
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap sampling and feature selection so the
# reported score is the same on every Restart & Run All.
reg4 = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred4 = reg4.predict(X_test)
accuracy_scores[4] = test_score(y_test, y_pred4)

In [None]:
# Gradient Boosting Regressor -> score slot 5
from sklearn.ensemble import GradientBoostingRegressor

# random_state makes the fit repeatable across runs; all other settings are
# the library defaults.
reg5 = GradientBoostingRegressor(random_state=42)
reg5.fit(X_train, y_train)
y_pred5 = reg5.predict(X_test)
accuracy_scores[5] = test_score(y_test, y_pred5)

In [None]:
# XGBoost Regressor (default settings) -> score slot 6
from xgboost import XGBRegressor

xg_model = XGBRegressor().fit(X_train, y_train)
xg_pred = xg_model.predict(X_test)
accuracy_scores[6] = test_score(y_test, xg_pred)

In [None]:
# LightGBM Regressor (default settings) -> score slot 7
import lightgbm

lgb_model = lightgbm.LGBMRegressor().fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
accuracy_scores[7] = test_score(y_test, lgb_pred)

In [None]:
# AdaBoost Regressor -> score slot 8
from sklearn.ensemble import AdaBoostRegressor

# random_state pins the resampling done at each boosting round so the
# reported score is the same on every Restart & Run All.
ABR_model = AdaBoostRegressor(random_state=42)
ABR_model.fit(X_train, y_train)
ABR_pred = ABR_model.predict(X_test)

accuracy_scores[8] = test_score(y_test, ABR_pred)


In [None]:
# Regression with TensorFlow
# pip install --upgrade tensorflow
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, Activation,Dropout
from tensorflow.keras.models import Model
print(tf.__version__)

# Seed TensorFlow before any layer is created so weight initialisation (and
# therefore the reported score) is repeatable across runs on the same setup.
tf.random.set_seed(42)

# Standardize the features: the scaler is fitted on the training split only
# and then applied to the test split.
# NOTE: this rebinds X_train / X_test to scaled NumPy arrays, so every later
# cell that uses these names works on the standardized data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fully connected network: 100 -> 50 -> 25 ReLU units, single linear output.
input_layer = Input(shape=(X.shape[1],))
dense_layer_1 = Dense(100, activation='relu')(input_layer)
dense_layer_2 = Dense(50, activation='relu')(dense_layer_1)
dense_layer_3 = Dense(25, activation='relu')(dense_layer_2)
output = Dense(1)(dense_layer_3)

# Optimise mean squared error with Adam; MAE is tracked as a metric.
model = Model(inputs=input_layer, outputs=output)
model.compile(loss="mean_squared_error" , optimizer="adam", metrics=["mae"])


In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would append a stray "None" line.
model.summary()

In [None]:
# Train for 100 epochs in batches of 128, holding out 20% of the training
# rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Evaluate on the test split using the loss and metric set in compile().
model.evaluate(X_test, y_test)

In [None]:
# Predict on the test split and store the network's score in slot 9.
tensor_pred = model.predict(X_test)

accuracy_scores[9] = test_score(y_test, tensor_pred)

Make predictions
Finally, have a look at the errors made by the model when making predictions on the test set:

In [None]:
# Predicted vs. true posttest scores on the test set; points on the diagonal
# are perfect predictions.
test_predictions = model.predict(X_test).flatten()

a = plt.axes(aspect='equal')
# Plot the flattened predictions computed above so this cell does not
# depend on `tensor_pred` from an earlier cell.
plt.scatter(y_test, test_predictions)
# The target is the posttest score (the earlier "[MPG]" labels were wrong).
plt.xlabel('True Values [posttest score]')
plt.ylabel('Predictions [posttest score]')
lims = [0, 100]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

It looks like the model predicts reasonably well.

Now take a look at the error distribution:

In [None]:
# Prediction error of the neural network on the test set. The error is
# measured against the true targets (y_test), not against `y_pred`, which
# holds the linear-regression predictions from an earlier cell.
error = test_predictions - y_test.to_numpy()
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [posttest score]')
_ = plt.ylabel('Count')

Mean Squared Error (MSE) and Mean Absolute Error (MAE) are common loss functions used for regression problems. Mean Absolute Error is less sensitive to outliers. Different loss functions are used for classification problems.

Here I check the model with cross-validation and hyperparameter tuning.

In [None]:
from sklearn.model_selection import KFold

# 10-fold splitter and a fresh, default LightGBM regressor for cross-validation.
kfold_validation = KFold(n_splits=10)
lgb_model = lightgbm.LGBMRegressor()


In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Cross-validate on the full data set; print the per-fold scores and their mean.
results = cross_val_score(lgb_model, X, y, cv=kfold_validation)
print(results)
print(results.mean())

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation with shuffling. StratifiedKFold is designed for
# class labels; applied to a regression target it treats each distinct score
# as a class, so a shuffled KFold is used instead (seeded for repeatability).
skfold = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_model = lightgbm.LGBMRegressor()
scores = cross_val_score(lgb_model, X, y, cv=skfold)
print(np.mean(scores))

Hyper Parameter tuning for LightGBM model


In [None]:
## Hyper Parameter Optimization
## Search space for the LightGBM regressor. Only parameters that LightGBM
## uses are listed; `booster` and `base_score` are XGBoost settings, so
## sampling them would spend search iterations on values with no effect.
n_estimators = [100, 500, 900, 1100, 1500]  # number of boosted trees
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'min_child_weight': min_child_weight,
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 50 sampled parameter sets, 5-fold CV each, ranked by
# negated mean absolute error, run on 4 workers with a fixed seed.
random_cv = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=4,
    verbose=5,
    random_state=42,
)

In [None]:
# Run the randomized search on the training split.
random_cv.fit(X_train,y_train)

In [None]:
# Best estimator found by the search.
random_cv.best_estimator_

In [None]:
# The best estimator is already shown in the previous cell; show the winning
# parameter combination and its cross-validated score (negated MAE) instead.
random_cv.best_params_, random_cv.best_score_

In [None]:
# Tuned LightGBM regressor. Only LightGBM parameters are passed: `booster`
# and `base_score` belong to XGBoost's API and are not used by LightGBM.
lgb_reg = lightgbm.LGBMRegressor(
    learning_rate=0.2,
    max_depth=2,
    min_child_weight=3,
    n_estimators=1500,
)

In [None]:
# Fit the tuned regressor on the training split.
lgb_reg.fit(X_train, y_train)

In [None]:
# Predict on the test split with the tuned regressor.
lgb_pred1 = lgb_reg.predict(X_test)

In [None]:
# Store the tuned model's score in the last slot (10).
accuracy_scores[10] = test_score(y_test, lgb_pred1)

In [None]:
sns.set_style('whitegrid')

# Labels follow the order in which accuracy_scores slots 0..10 were filled.
# Slot 10 holds the hyperparameter-tuned LightGBM model, not an XGBoost one.
models = ["Linear Regression","Lasso Regressor","Decision Tree Regressor","Support Vector Regressor","Random Forest Regressor","Gradient boost Regressor","XGBoost Regressor","LightGBM REgressor","Ada Boost Regressor","Tensor Regressor","LightGBM Hyper"]


plt.figure(figsize=(11, 11))
sns.barplot(x=accuracy_scores, y=models)


# Horizontal bars: the scores run along x and the model names along y.
plt.xlabel("Accuracy")
plt.ylabel("Model_Name")

plt.show()

In [None]:
sns.set_style('whitegrid')
models = ["Linear Regression","Lasso Regressor","Decision Tree Regressor","Support Vector Regressor","Random Forest Regressor","Gradient boost Regressor","XGBoost Regressor","LightGBM REgressor","Tensor Regressor","AdaBoost Regressor"]

# Hard-coded MAE values, listed in the same order as `models` (presumably
# copied from an earlier run; verify against the printed MAE lines).
# They are numbers, not strings, so the y-axis is a continuous, ordered scale.
mae = [2.50, 2.61, 3.23, 3.38, 2.63, 2.48, 2.48, 2.48, 2.62, 2.82]

# Draw on explicit axes: the figure-level relplot opens its own figure, so a
# preceding plt.figure(...) would only leave an empty canvas behind.
_, ax = plt.subplots(figsize=(11, 11))
sns.scatterplot(x=models, y=mae, ax=ax)

plt.xlabel("Model_Name")
plt.xticks(rotation = -90)
plt.ylabel("MAE")

plt.show()

Conclusion


From the above comparison of all algorithms by their accuracy and mean absolute error:

The LightGBM boosting model performs well, with an accuracy of 95.04 and an MAE of 2.48.

If you look closely, the accuracy and mean absolute error yielded by the boosting algorithms are nearly the same, so boosting models work well on this dataset.