In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
# The second element yielded by os.walk (the sub-directory list) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory and the file name into one path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Data**

In [None]:
import seaborn as sns
# `plt` is the conventional alias for the pyplot module, not the top-level
# matplotlib package; binding the package here would make plt.show(),
# plt.subplots(), etc. fail with AttributeError.
import matplotlib.pyplot as plt

# Load the student test-score dataset from the Kaggle input directory.
data = pd.read_csv('../input/predict-test-scores-of-students/test_scores.csv')

# Target column the models will predict.
y = data['posttest']
# Feature columns used as model inputs.
X_features = ['school','school_setting','school_type','classroom','teaching_method','n_student','gender','lunch','pretest']
X = data[X_features]

# Preview the first rows. This must be the last expression of the cell,
# otherwise the notebook silently discards the result.
data.head()




# **Check info about data**

In [None]:
# Print a summary of the feature frame (columns, non-null counts, dtypes).
X.info()

**As we can see, the data has 7 columns of dtype=object**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [None]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Names of the columns holding categorical (object dtype) data.
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_test = X_test.copy()

# Encode every categorical feature column as integer codes.
# OrdinalEncoder is the feature-oriented counterpart of LabelEncoder: it keeps
# one mapping per column (a single LabelEncoder refit in a loop only remembers
# the last column) and, with handle_unknown='use_encoded_value', maps a
# category that is present only in the test split to -1 instead of raising
# ValueError.
if object_cols:  # guard: fitting on zero columns would raise
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                     unknown_value=-1,
                                     dtype=int)
    # Fit on the training split only, then reuse the same mapping for the test split.
    label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
    label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
def metrics(y_true, y_preds):#func by Kwadwo Ofosu
    """Compute, print and return regression quality scores.

    Parameters
    ----------
    y_true : array-like
        Reference (ground-truth) target values.
    y_preds : array-like
        Values predicted by a model.

    Returns
    -------
    dict
        The raw R2 score plus the MAE and MSE rounded to two decimals.
    """
    r2_value = r2_score(y_true, y_preds)
    abs_error = mean_absolute_error(y_true, y_preds)
    sq_error = mean_squared_error(y_true, y_preds)

    # Build the result first; the printed report below uses the unrounded values.
    summary = {
        'r2_score' : r2_value,
        'mean absolute error' : round(abs_error, 2),
        'mean_squared error' : round(sq_error, 2),
    }

    report_lines = (
        f'R2_score: {r2_value * 100:.2f}%',
        f'MAE: {abs_error:.2f}',
        f'MSE: {sq_error:.2f}',
    )
    for report_line in report_lines:
        print(report_line)

    return summary


# **Make a heatmap of correlations with pretest, because pretest correlates with posttest at over 90%**

In [None]:
# Correlation of every encoded feature with the pretest score, strongest first.
pretest_corr = label_X_test.corr()[['pretest']]
pretest_corr_sorted = pretest_corr.sort_values(by='pretest', ascending=False)

# Single-column annotated heatmap on a fixed [-1, 1] colour scale.
heatmap = sns.heatmap(pretest_corr_sorted, annot=True, cmap='BrBG', vmin=-1, vmax=1)
heatmap.figure.set_size_inches(6, 6)
heatmap.set_title('Features Correlating with Pretest', fontdict={'fontsize':20}, pad=30);

**lunch, n_student and school_type have the most notable correlations**

# Check each feature to see the relations

In [None]:
# Distribution of pretest scores for each school type.
ax = sns.boxplot(x='school_type', y='pretest', data=data, width=0.2, orient='v')

# Titles and axis captions.
ax.set_title("Boxplot of Pretest Scores by School type", fontsize=20)
ax.set_xlabel("School", fontsize=16)
ax.set_ylabel("Scores", fontsize=16)

# Tilt the category labels and widen the figure.
current_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(current_tick_labels, rotation=30)
ax.figure.set_size_inches(12, 6)
ax

In [None]:
# Distribution of pretest scores for each value of n_student.
ax = sns.boxplot(data=data, x='n_student', y='pretest', orient='v', width = 0.2)
ax.figure.set_size_inches(12, 6)
# Title and x-label describe the variable actually plotted (they were copied
# from the school-type plot and still referred to "School type" / "School").
ax.set_title("Boxplot of Pretest Scores by Number of Students", fontsize = 20)
ax.set_ylabel("Scores", fontsize = 16)
ax.set_xlabel("Number of students", fontsize = 16)
# Trailing semicolon keeps the notebook from echoing the list of Text objects.
ax.set_xticklabels(ax.get_xticklabels(),rotation=30);

In [None]:
# Distribution of pretest scores for each lunch category.
ax = sns.boxplot(data=data, x='lunch', y='pretest', orient='v', width = 0.2)
ax.figure.set_size_inches(12, 6)
# Title and x-label describe the variable actually plotted (they were copied
# from the school-type plot and still referred to "School type" / "School").
ax.set_title("Boxplot of Pretest Scores by Lunch", fontsize = 20)
ax.set_ylabel("Scores", fontsize = 16)
ax.set_xlabel("Lunch", fontsize = 16)
# Trailing semicolon keeps the notebook from echoing the list of Text objects.
ax.set_xticklabels(ax.get_xticklabels(),rotation=30);

**We can see that, if you want to improve your test score, it helps to have:**
1. **Number of students <23**
2. **Public school**
3. **Non-subsidized lunch**

# Time to build the prediction models

In [None]:
from sklearn.linear_model import LinearRegression 
def linear_r(X_train,X_test,y_train,y_test):
    """Fit an ordinary least-squares model and report its hold-out scores.

    Parameters
    ----------
    X_train, X_test : numerically encoded feature frames for fitting / evaluation.
    y_train, y_test : matching target values.

    Returns
    -------
    dict
        The score dictionary produced by ``metrics`` for the test split.
    """
    linear_reg = LinearRegression()
    # Use the arguments, not the notebook globals, so the function evaluates
    # whatever data the caller passes in.
    linear_reg.fit(X_train, y_train)
    y_pred = linear_reg.predict(X_test)

    # Distribution of the predicted scores.
    sns.histplot(y_pred)

    # metrics expects (y_true, y_pred); R2 is not symmetric, so the order matters.
    linear_score = metrics(y_test, y_pred)
    print(linear_score)
    return linear_score
linear_scores = linear_r(label_X_train,label_X_test,y_train,y_test)

In [None]:
from sklearn.linear_model import TweedieRegressor
def tweedie_r(X_train,X_test,y_train,y_test):
    """Fit a Tweedie GLM (power=1, log link) and report its hold-out scores.

    Parameters
    ----------
    X_train, X_test : numerically encoded feature frames for fitting / evaluation.
    y_train, y_test : matching target values.

    Returns
    -------
    dict
        The score dictionary produced by ``metrics`` for the test split.
    """
    tweedie_reg = TweedieRegressor(power=1, alpha=0.5, link='log')
    tweedie_reg.fit(X_train, y_train)
    y_pred = tweedie_reg.predict(X_test)

    # Distribution of the predicted scores.
    sns.histplot(y_pred)

    # metrics expects (y_true, y_pred); R2 is not symmetric, so the order matters.
    tweedie_score = metrics(y_test, y_pred)
    print(tweedie_score)
    return tweedie_score
tweedie_scores = tweedie_r(label_X_train,label_X_test,y_train,y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
def rfr_r(X_train,X_test,y_train,y_test):
    """Fit a random-forest regressor and report its hold-out scores.

    Parameters
    ----------
    X_train, X_test : numerically encoded feature frames for fitting / evaluation.
    y_train, y_test : matching target values.

    Returns
    -------
    dict
        The score dictionary produced by ``metrics`` for the test split.
    """
    # Fixed seed (same value as the train/test split) so the forest, and
    # therefore the reported scores, are identical on every run.
    rfr = RandomForestRegressor(random_state=42)
    # Use the arguments, not the notebook globals, so the function evaluates
    # whatever data the caller passes in.
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)

    # metrics expects (y_true, y_pred); R2 is not symmetric, so the order matters.
    rfr_score = metrics(y_test, y_pred)
    print(rfr_score)

    # Distribution of the predicted scores.
    sns.histplot(y_pred)
    return rfr_score
rfr_scores = rfr_r(label_X_train,label_X_test,y_train,y_test)

In [None]:
from sklearn.linear_model import Lasso
def lasso_r(X_train,X_test,y_train,y_test):
    """Fit a Lasso regression (default settings) and report its hold-out scores.

    Parameters
    ----------
    X_train, X_test : numerically encoded feature frames for fitting / evaluation.
    y_train, y_test : matching target values.

    Returns
    -------
    dict
        The score dictionary produced by ``metrics`` for the test split.
    """
    lasso_reg = Lasso()
    # Use the arguments, not the notebook globals, so the function evaluates
    # whatever data the caller passes in.
    lasso_reg.fit(X_train, y_train)
    y_pred = lasso_reg.predict(X_test)

    # metrics expects (y_true, y_pred); R2 is not symmetric, so the order matters.
    lasso_score = metrics(y_test, y_pred)
    print(lasso_score)

    # Distribution of the predicted scores.
    sns.histplot(y_pred)
    return lasso_score
lasso_scores = lasso_r(label_X_train,label_X_test,y_train,y_test)