# PACKAGES AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import scale
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

In [None]:
# Silence deprecation, future and user warnings for the rest of the notebook.
for ignored_category in (DeprecationWarning, FutureWarning, UserWarning):
    filterwarnings("ignore", category=ignored_category)

# HISTORY

* NOR - Total number of pupils on roll
* PNORG - Percentage of girls on roll
* PNORB - Percentage of boys on roll
* School_Phase - Phase of School (Type)
* Total_Teachers - Total Number of Teachers
* Total_Teaching_Assistants - Number of Teaching Assistants (Headcount)
* Total_Instructor - Total_Teachers + Total_Teaching_Assistants
* School_Support - Total Number of Non Classroom-based School Support Staff, Excluding Auxiliary Staff
* Full_Time_Teachers - Percentage of full time teachers
* Pupil_Teacher_Ratio - Students per teacher
* GFTE - Gross full time employment salary for teachers for one year
* GPS_AVERAGE - Grammar Score Average
* MAT_AVERAGE - Math Score Average
* READ_AVERAGE - Reading Score Average


# Data Source

In [None]:
# Load the raw CSV once; every later transformation is applied to the
# independent working copy so the loaded frame stays untouched.
SchoolData = pd.read_csv("../input/schooldata/schoolpredict.csv")
data = SchoolData.copy(deep=True)

In [None]:
# Normalise two awkward source column names (one has a trailing space).
data.rename(
    columns={
        "Total_Teaching_Assistants ": "Teaching_Assistants",
        "School_0ort": "School_Support",
    },
    inplace=True,
)

In [None]:
# Remove the two source columns in one call; Pupil_Teacher_Ratio is
# recomputed from NOR and Total_Instructor further down.
data.drop(columns=["Pupil_Teacher_Ratio", "Full_Time_Teachers"], inplace=True)

In [None]:
# Total_Instructor = teachers + teaching assistants;
# Pupil_Teacher_Ratio = pupils on roll per instructor.
# NOTE(review): a Total_Instructor of 0 would yield an infinite ratio — confirm
# the data never contains that case.
instructor_total = data["Total_Teachers"] + data["Teaching_Assistants"]
data["Total_Instructor"] = instructor_total
data["Pupil_Teacher_Ratio"] = data["NOR"] / data["Total_Instructor"]

In [None]:
# The two headcounts are now folded into Total_Instructor, so drop them together.
data.drop(columns=["Total_Teachers", "Teaching_Assistants"], inplace=True)

In [None]:
# Cast the grammar score column to floating point.
gps_as_float = data["GPS_AVERAGE"].astype("float64")
data["GPS_AVERAGE"] = gps_as_float

In [None]:
# Cast the math score column to floating point.
mat_as_float = data["MAT_AVERAGE"].astype("float64")
data["MAT_AVERAGE"] = mat_as_float

In [None]:
# Cast the reading score column to floating point.
read_as_float = data["READ_AVERAGE"].astype("float64")
data["READ_AVERAGE"] = read_as_float

In [None]:
# Replace zero scores with NaN in the three score columns so they count as
# missing values. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
score_columns = ["GPS_AVERAGE", "MAT_AVERAGE", "READ_AVERAGE"]
data[score_columns] = data[score_columns].replace(0, np.nan)

# EXPLORATORY DATA ANALYSIS

In [None]:
# Preview the first five rows.
print(data.head(n=5))

In [None]:
# Report the (rows, columns) size of the working frame.
row_count, column_count = data.shape
print((row_count, column_count))

In [None]:
# List the column labels after the renames and drops.
column_index = data.columns
print(column_index)

In [None]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Summary statistics, one row per column.
summary_stats = data.describe()
print(summary_stats.transpose())

In [None]:
# Average number of pupils on roll for each distinct instructor count.
nor_by_instructor = data.groupby(["Total_Instructor"])["NOR"]
print(nor_by_instructor.mean())

In [None]:
# Number of schools with fewer than 200 pupils on roll.
# Summing the boolean mask counts every matching row; the previous
# where(...).value_counts().sum() silently dropped rows that had a NaN in
# any column and therefore under-counted.
print((data["NOR"] < 200).sum())

In [None]:
# Number of schools with more than 100 instructors.
# Summing the boolean mask counts every matching row; the previous
# where(...).value_counts().sum() silently dropped rows that had a NaN in
# any column and therefore under-counted.
print((data["Total_Instructor"] > 100).sum())

In [None]:
# Number of schools with fewer than 5 instructors.
# Summing the boolean mask counts every matching row; the previous
# where(...).value_counts().sum() silently dropped rows that had a NaN in
# any column and therefore under-counted.
print((data["Total_Instructor"] < 5).sum())

In [None]:
# Number of schools whose GFTE exceeds 40000.
# Summing the boolean mask counts every matching row; the previous
# where(...).value_counts().sum() silently dropped rows that had a NaN in
# any column and therefore under-counted.
print((data["GFTE"] > 40000).sum())

In [None]:
# Number of schools whose GFTE is above the overall GFTE mean.
# Summing the boolean mask counts every matching row; the previous
# where(...).value_counts().sum() silently dropped rows that had a NaN in
# any column and therefore under-counted.
gfte_mean = data["GFTE"].mean()
print((data["GFTE"] > gfte_mean).sum())

In [None]:
# How many rows are exact duplicates of an earlier row (True) versus unique (False).
duplicate_flags = data.duplicated()
print(duplicate_flags.value_counts())

In [None]:
# Flag columns that contain nothing but missing values.
missing_mask = data.isna()
print(missing_mask.all())

In [None]:
# Missing-value count per column.
missing_mask = data.isna()
print(missing_mask.sum())

# MISSING VALUES

In [None]:
# Nullity matrix: where in the frame the missing values sit.
missing_figsize = (8, 5)
msno.matrix(data, figsize=missing_figsize)
plt.show()

In [None]:
# Bar chart of non-missing counts per column.
missing_figsize = (8, 5)
msno.bar(data, figsize=missing_figsize)
plt.show()

In [None]:
# Nullity correlation heatmap between columns.
missing_figsize = (8, 5)
msno.heatmap(data, figsize=missing_figsize)
plt.show()

In [None]:
# Fill missing values with the mean of the same School_Phase group.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained in-place operation that does not update
# the DataFrame under pandas Copy-on-Write (the FutureWarning announcing this
# is hidden by the warning filters set at the top of the notebook).
columns_to_impute = [
    "GFTE",
    "School_Support",
    "Total_Instructor",
    "Pupil_Teacher_Ratio",
    "GPS_AVERAGE",
    "MAT_AVERAGE",
    "READ_AVERAGE",
]
for column in columns_to_impute:
    phase_mean = data.groupby("School_Phase")[column].transform("mean")
    data[column] = data[column].fillna(phase_mean)

In [None]:
# Target feature: the mean of the three score averages divided by the
# pupil-teacher ratio.
# Author's note: this formula is intended to prevent over-fitting.
total_score = data["GPS_AVERAGE"] + data["MAT_AVERAGE"] + data["READ_AVERAGE"]
average_score = total_score / 3
data["Success_Mean"] = average_score / data["Pupil_Teacher_Ratio"]

In [None]:
# Preview the first five rows, now including Success_Mean.
print(data.head(n=5))

In [None]:
# Re-check the per-column missing counts after imputation.
missing_mask = data.isna()
print(missing_mask.sum())

In [None]:
# Numeric-only view used for the correlation table and the outlier detection.
numeric_dtypes = ["float64", "int64", "int32"]
df = data.select_dtypes(include=numeric_dtypes)

In [None]:
# Pairwise Pearson correlation of the numeric columns.
numeric_corr = df.corr(method="pearson")
print(numeric_corr)

# OUTLIER

In [None]:
# Independent copy of the numeric frame for inspecting outliers.
DataForA = df.copy(deep=True)

In [None]:
# Fit a Local Outlier Factor model with default settings on the numeric frame;
# the returned labels are only displayed, the scores are read in the next cell.
clf = LocalOutlierFactor()
clf.fit_predict(X=df)

In [None]:
# Per-row negative outlier factor from the fitted LocalOutlierFactor model.
score = clf.negative_outlier_factor_

In [None]:
# Sort the scores in ascending order and show the 70 lowest.
sortedScore = np.sort(score)
print(sortedScore[:70])

In [None]:
# Use the fourth-lowest score (index 3 of the ascending sort) as the outlier threshold.
point = sortedScore[3]

In [None]:
# Show the threshold score and the row(s) whose score equals it.
separator = "---" * 20
threshold_rows = DataForA[score == point]
print(f"Outlier Point -- > {point} ")
print(separator)
print("Outlier Row -- >\n", threshold_rows)

In [None]:
# Rows whose outlier score is below the threshold score.
# The mask must be built from the LOF scores; the previous version compared
# the data values themselves with the score threshold, which is meaningless.
totaloutlier = score < point
print(DataForA[totaloutlier])

# ENCODE 

In [None]:
# Label encoder used below to turn School_Phase into integer codes.
encode = LabelEncoder()

In [None]:
# Frequency of each School_Phase value before encoding.
phase_counts = data["School_Phase"].value_counts()
print(phase_counts)

In [None]:
# Replace the School_Phase values with the encoder's integer codes.
phase_codes = encode.fit_transform(data["School_Phase"])
data["School_Phase"] = phase_codes

In [None]:
# Frequency of each School_Phase code after encoding.
phase_counts = data["School_Phase"].value_counts()
print(phase_counts)

* Primary                    4
* Secondary                  5
* All through                1
* Middle deemed secondary    3
* 16 plus                    0
* Middle deemed primary      2

# CORRELATION - COVARIANCE - NORMALITY - HOMOGENEITY

#### Correlation

In [None]:
# Pearson and Spearman correlation matrices of the whole frame.
corrPearson, corrSpearman = (
    data.corr(method=method_name) for method_name in ("pearson", "spearman")
)

In [None]:
# Annotated heatmap of the Pearson matrix on the fixed [-1, 1] scale.
heatmap_options = dict(annot=True, vmin=-1, center=0, vmax=1)
figure = plt.figure(figsize=(20, 8))
sns.heatmap(corrPearson, **heatmap_options)
plt.title("PEARSON")
plt.show()

In [None]:
# Annotated heatmap of the Spearman matrix on the fixed [-1, 1] scale.
heatmap_options = dict(annot=True, vmin=-1, center=0, vmax=1)
figure = plt.figure(figsize=(20, 8))
sns.heatmap(corrSpearman, **heatmap_options)
plt.title("SPEARMAN")
plt.show()

#### Covariance

In [None]:
# Pairwise covariance matrix of the columns in the frame.
covv = data.cov()

In [None]:
# Annotated covariance heatmap.
# Covariances are not bounded to [-1, 1], so the colour range is left to
# seaborn (kept symmetric around 0 via center=0) instead of being clamped with
# vmin/vmax, which saturated the colours.
figure = plt.figure(figsize=(20, 8))
sns.heatmap(covv, annot=True, center=0)
plt.title("COVARIANCE")
plt.show()

#### Normality

In [None]:
# Shapiro-Wilk normality test for every column: prints "statistic - p-value".
column_separator = "---" * 30
for column_name in data.columns:
    print(column_separator)
    print(column_name)
    shapiro_result = shapiro(data[column_name])
    print("%.4f - %.4f" % shapiro_result)

#### Homogeneity

In [None]:
# Levene test of equal variances between Pupil_Teacher_Ratio and each of the
# listed columns: prints "statistic - p-value".
for other_column in ("NOR", "School_Phase", "GFTE", "School_Support"):
    levene_result = levene(data["Pupil_Teacher_Ratio"], data[other_column])
    print("%.3f - %.3f" % levene_result)

# VISUALIZATION

#### GENERAL HISTOGRAM

In [None]:
# One histogram per column on a 20x20 inch canvas.
histogram_size = (20, 20)
data.hist(figsize=histogram_size)
plt.show()

#### BOX PLOT

In [None]:
# Plotting copy in which School_Phase is categorical, so the main frame keeps
# its integer codes.
dataV = data.copy()
dataV["School_Phase"] = dataV["School_Phase"].astype("category")

comparison between School_Phase and Pupil_Teacher_Ratio

In [None]:
# Box plot of Pupil_Teacher_Ratio for each School_Phase.
figure, axis = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x="School_Phase", y="Pupil_Teacher_Ratio", ax=axis)
plt.show()

comparison between School_Phase and GFTE

In [None]:
# Box plot of GFTE for each School_Phase.
figure, axis = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x="School_Phase", y="GFTE", ax=axis)
plt.show()

comparison between School_Phase and NOR

In [None]:
# Box plot of NOR for each School_Phase.
figure, axis = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x="School_Phase", y="NOR", ax=axis)
plt.show()

#### BARPLOT

comparison between School_Phase and Pupil_Teacher_Ratio

In [None]:
# Bar plot of Pupil_Teacher_Ratio for each School_Phase.
figure, axis = plt.subplots(figsize=(20, 8))
sns.barplot(data=dataV, x="School_Phase", y="Pupil_Teacher_Ratio", ax=axis)
plt.show()

#### JOINTGRID & KDEPLOT

comparison between Pupil_Teacher_Ratio and Success_Mean

In [None]:
# Joint KDE of Pupil_Teacher_Ratio against Success_Mean.
# JointGrid creates its own figure, so its size is set through `height`; the
# previous separate plt.figure() call only rendered an extra empty figure.
g = sns.JointGrid(data=data, x="Pupil_Teacher_Ratio", y="Success_Mean", space=0, height=8)
g.plot_joint(sns.kdeplot, cmap="rocket")
plt.show()

comparison between NOR and Success_Mean

In [None]:
# Joint KDE of NOR against Success_Mean.
# JointGrid creates its own figure, so its size is set through `height`; the
# previous separate plt.figure() call only rendered an extra empty figure.
g = sns.JointGrid(data=data, x="NOR", y="Success_Mean", space=0, height=8)
g.plot_joint(sns.kdeplot, cmap="rocket")
plt.show()

#### SCATTERPLOT

comparison between Pupil_Teacher_Ratio and NOR based on School_Phase

In [None]:
# Scatter of Pupil_Teacher_Ratio against NOR, coloured by School_Phase.
figure, axis = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=dataV, x="Pupil_Teacher_Ratio", y="NOR", hue="School_Phase", ax=axis)
plt.show()

comparison between Total_Instructor and NOR based on School_Phase

In [None]:
# Scatter of Total_Instructor against NOR, coloured by School_Phase.
figure, axis = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=dataV, x="Total_Instructor", y="NOR", hue="School_Phase", ax=axis)
plt.show()

comparison between Total_Instructor and GFTE based on School_Phase

In [None]:
# Scatter of Total_Instructor against GFTE, coloured by School_Phase.
figure, axis = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=dataV, x="Total_Instructor", y="GFTE", hue="School_Phase", ax=axis)
plt.show()

#### LINEPLOT

comparison between Total_Instructor and NOR based on School_Phase

In [None]:
# Line plot of NOR over Total_Instructor, one line per School_Phase.
figure, axis = plt.subplots(figsize=(15, 8))
sns.lineplot(data=dataV, x="Total_Instructor", y="NOR", hue="School_Phase", ax=axis)
plt.show()

comparison between Total_Instructor and Success_Mean based on School_Phase

In [None]:
# Line plot of Success_Mean over Total_Instructor, one line per School_Phase.
figure, axis = plt.subplots(figsize=(15, 8))
sns.lineplot(data=dataV, x="Total_Instructor", y="Success_Mean", hue="School_Phase", ax=axis)
plt.show()

comparison between Total_Instructor and Success_Mean

In [None]:
# Line plot of Success_Mean over Total_Instructor for all schools together.
figure, axis = plt.subplots(figsize=(15, 8))
sns.lineplot(data=dataV, x="Total_Instructor", y="Success_Mean", ax=axis)
plt.show()

comparison Total_Instructor based on School_Phase

#### HISTPLOT

In [None]:
# Stacked, log-scaled histogram of Total_Instructor split by School_Phase.
stacked_hist_style = dict(multiple="stack", edgecolor=".3", linewidth=.5, log_scale=True)
figure, axis = plt.subplots(figsize=(20, 5))
sns.histplot(data, x="Total_Instructor", hue="School_Phase", ax=axis, **stacked_hist_style)
plt.show()

comparison Pupil_Teacher_Ratio based on School_Phase

In [None]:
# Stacked, log-scaled histogram of Pupil_Teacher_Ratio split by School_Phase.
stacked_hist_style = dict(multiple="stack", edgecolor=".3", linewidth=.5, log_scale=True)
figure, axis = plt.subplots(figsize=(20, 5))
sns.histplot(data, x="Pupil_Teacher_Ratio", hue="School_Phase", ax=axis, **stacked_hist_style)
plt.show()

comparison between Pupil_Teacher_Ratio and NOR

#### JOINTPLOT

In [None]:
# Joint plot of Pupil_Teacher_Ratio against NOR.
# jointplot builds its own figure, so its size is set through `height`; the
# previous separate plt.figure() call only rendered an extra empty figure.
sns.jointplot(x="Pupil_Teacher_Ratio", y="NOR", color="#4CB391", data=data, height=8)
plt.show()

comparison between Pupil_Teacher_Ratio and School_Support

In [None]:
# Joint plot of Pupil_Teacher_Ratio against School_Support.
# jointplot builds its own figure, so its size is set through `height`; the
# previous separate plt.figure() call only rendered an extra empty figure.
sns.jointplot(x="Pupil_Teacher_Ratio", y="School_Support", color="#4CB391", data=data, height=8)
plt.show()

comparison between Pupil_Teacher_Ratio and Pupil_Teacher_Ratio mean based on NOR

* BAD and GOOD label


As the number of students increases, the number of students per teacher increases

#### DISTPLOT

In [None]:
# Distribution of NOR for schools above (BAD) and below (GOOD) the mean
# pupil-teacher ratio.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
ratio = data["Pupil_Teacher_Ratio"]
ratio_mean = ratio.mean()
figure = plt.figure(figsize=(20, 8))
sns.histplot(data.loc[ratio > ratio_mean, "NOR"], kde=True, stat="density",
             color="black", label="BAD")
sns.histplot(data.loc[ratio < ratio_mean, "NOR"], kde=True, stat="density",
             color="red", label="GOOD")
plt.title("Pupil_Teacher_Ratio", fontsize=10)
plt.legend()
plt.show()


comparison between Pupil_Teacher_Ratio and Pupil_Teacher_Ratio mean based on School_Phase

* BAD and GOOD label

Primary 4 / 
Secondary 5

* At 5, the number of students per teacher is much higher than that of 4.

In [None]:
# Distribution of School_Phase codes for schools above (BAD) and below (GOOD)
# the mean pupil-teacher ratio.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
ratio = data["Pupil_Teacher_Ratio"]
ratio_mean = ratio.mean()
figure = plt.figure(figsize=(20, 8))
sns.histplot(data.loc[ratio > ratio_mean, "School_Phase"], kde=True, stat="density",
             color="black", label="BAD")
sns.histplot(data.loc[ratio < ratio_mean, "School_Phase"], kde=True, stat="density",
             color="red", label="GOOD")
plt.title("Pupil_Teacher_Ratio", fontsize=10)
plt.legend()
plt.show()

comparison between Pupil_Teacher_Ratio and Pupil_Teacher_Ratio mean based on GFTE

* BAD and GOOD label

Taken on its own, GFTE shows no big difference between the BAD and GOOD groups.

In [None]:
# Distribution of GFTE for schools above (BAD) and below (GOOD) the mean
# pupil-teacher ratio.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
ratio = data["Pupil_Teacher_Ratio"]
ratio_mean = ratio.mean()
figure = plt.figure(figsize=(20, 8))
sns.histplot(data.loc[ratio > ratio_mean, "GFTE"], kde=True, stat="density",
             color="black", label="BAD")
sns.histplot(data.loc[ratio < ratio_mean, "GFTE"], kde=True, stat="density",
             color="red", label="GOOD")
plt.title("Pupil_Teacher_Ratio", fontsize=10)
plt.legend()
plt.show()

comparison between Pupil_Teacher_Ratio and Pupil_Teacher_Ratio mean based on Success_Mean

* BAD and GOOD label

Taken on its own, Success_Mean shows no big difference between the BAD and GOOD groups.

In [None]:
# Distribution of Success_Mean for schools above (BAD) and below (GOOD) the
# mean pupil-teacher ratio.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
ratio = data["Pupil_Teacher_Ratio"]
ratio_mean = ratio.mean()
figure = plt.figure(figsize=(20, 8))
sns.histplot(data.loc[ratio > ratio_mean, "Success_Mean"], kde=True, stat="density",
             color="black", label="BAD")
sns.histplot(data.loc[ratio < ratio_mean, "Success_Mean"], kde=True, stat="density",
             color="red", label="GOOD")
plt.title("Pupil_Teacher_Ratio", fontsize=10)
plt.legend()
plt.show()

#### 3D

In [None]:
# 3D scatter of Pupil_Teacher_Ratio, NOR and School_Phase.
# The axes are created with add_subplot(projection="3d"): constructing
# Axes3D(fig) directly no longer attaches the axes to the figure in current
# Matplotlib releases, which leaves the plot blank.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["Pupil_Teacher_Ratio"], data["NOR"], data["School_Phase"], c="red", s=20, alpha=0.2)
plt.show()

In [None]:
# 3D scatter of Pupil_Teacher_Ratio, NOR and GFTE.
# The axes are created with add_subplot(projection="3d"): constructing
# Axes3D(fig) directly no longer attaches the axes to the figure in current
# Matplotlib releases, which leaves the plot blank.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["Pupil_Teacher_Ratio"], data["NOR"], data["GFTE"], c="black", s=20, alpha=0.2)
plt.show()

#### PAIRGRID

In [None]:
# Point plot of Pupil_Teacher_Ratio per School_Phase.
# PairGrid creates its own figure (sized by height/aspect), so the previous
# separate plt.figure() call only rendered an extra empty figure.
grid = sns.PairGrid(dataV, y_vars="Pupil_Teacher_Ratio",
                    x_vars=["School_Phase"], height=10, aspect=.5)
grid.map(sns.pointplot, scale=1.3, errwidth=2, color="black")
plt.show()

In [None]:
# Point plot of Success_Mean per School_Phase.
# PairGrid creates its own figure (sized by height/aspect), so the previous
# separate plt.figure() call only rendered an extra empty figure.
grid = sns.PairGrid(dataV, y_vars="Success_Mean",
                    x_vars=["School_Phase"], height=10, aspect=.5)
grid.map(sns.pointplot, scale=1.3, errwidth=2, color="black")
plt.show()

# SPECIAL CORRELATIONS

In [None]:
# Spearman correlation between Pupil_Teacher_Ratio and School_Support.
ratio_column = data["Pupil_Teacher_Ratio"]
print(ratio_column.corr(data["School_Support"], method="spearman"))

In [None]:
# Spearman correlation between Pupil_Teacher_Ratio and NOR.
ratio_column = data["Pupil_Teacher_Ratio"]
print(ratio_column.corr(data["NOR"], method="spearman"))

In [None]:
# Spearman correlation between Pupil_Teacher_Ratio and School_Phase.
ratio_column = data["Pupil_Teacher_Ratio"]
print(ratio_column.corr(data["School_Phase"], method="spearman"))

In [None]:
# Spearman correlation between Pupil_Teacher_Ratio and GFTE.
ratio_column = data["Pupil_Teacher_Ratio"]
print(ratio_column.corr(data["GFTE"], method="spearman"))

In [None]:
# Spearman correlation between Pupil_Teacher_Ratio and Success_Mean.
ratio_column = data["Pupil_Teacher_Ratio"]
print(ratio_column.corr(data["Success_Mean"], method="spearman"))

# PREDICTION MODELS

In [None]:
# Feature matrix: everything except the columns listed below; target: Success_Mean.
# NOTE(review): Success_Mean is computed by dividing by Pupil_Teacher_Ratio,
# which stays in the features — confirm this target leakage is intended.
excluded_columns = [
    "GFTE",
    "School_Support",
    "PNORG",
    "PNORB",
    "GPS_AVERAGE",
    "MAT_AVERAGE",
    "Total_Instructor",
    "READ_AVERAGE",
    "Success_Mean",
]
x = data.drop(columns=excluded_columns)
y = data["Success_Mean"]

In [None]:
print(data["Success_Mean"].mean())
# Author's rule of thumb: below the average is classified as "okay",
# above the average as "bad".
# NOTE(review): Success_Mean grows with higher scores and with fewer pupils per
# instructor, so this labelling looks inverted — confirm the intended direction.

In [None]:
print(data["Success_Mean"].max())
# As the value approaches the maximum, the quality of education increases.

In [None]:
print(data["Success_Mean"].min())
# As the value approaches the minimum, the quality of education decreases
# (Success_Mean is score average divided by pupil-teacher ratio, so low values
# mean low scores and/or many pupils per instructor).

In [None]:
import statsmodels.stats.api as sms

In [None]:
# t-based confidence interval for the mean of Success_Mean (default alpha).
success_stats = sms.DescrStatsW(data["Success_Mean"])
print(success_stats.tconfint_mean())

In [None]:
# Features that will be used for estimation.
feature_names = x.columns
print(feature_names)

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
xTrain, xTest, yTrain, yTest = split_parts

#### MODELS

In [None]:
# Linear family
lm = LinearRegression()
pls = PLSRegression()
ridge = Ridge()
lasso = Lasso()
elasticnet = ElasticNet()
# Neighbours and single tree
knnr = KNeighborsRegressor()
cartr = DecisionTreeRegressor(random_state=42)
# Ensembles and boosting
baggr = BaggingRegressor(random_state=42, bootstrap_features=True, verbose=False)
rfr = RandomForestRegressor(random_state=42, verbose=False)
gbmr = GradientBoostingRegressor(verbose=False)
xgbr = XGBRegressor()
lgbmr = LGBMRegressor()
catbr = CatBoostRegressor(verbose=False)

# Fit every estimator on the training split, in the same order as before.
for estimator in (lm, pls, ridge, lasso, elasticnet, knnr, cartr,
                  baggr, rfr, gbmr, xgbr, lgbmr, catbr):
    estimator.fit(xTrain, yTrain)

In [None]:
# Fitted regressors, in the order they are evaluated below.
models = [
    lm,
    pls,
    ridge,
    lasso,
    elasticnet,
    knnr,
    cartr,
    baggr,
    rfr,
    gbmr,
    xgbr,
    lgbmr,
    catbr,
]

#### ACCURACY AND MEAN SQUARED ERROR

In [None]:
# Evaluate every model two ways:
#   1. 10-fold cross-validation on the TRAINING split (previously this ran on
#      the small test split, so the training data and the fitted models were
#      never actually evaluated);
#   2. the already-fitted model scored on the held-out test split.
# cross_validate computes both metrics from a single set of fits instead of
# cross-validating each model twice.
for model in models:
    name = model.__class__.__name__
    cv_scores = model_selection.cross_validate(
        model, xTrain, yTrain, cv=10,
        scoring=("r2", "neg_mean_squared_error"),
    )
    R2CV = cv_scores["test_r2"].mean()
    error = -cv_scores["test_neg_mean_squared_error"].mean()
    test_predictions = model.predict(xTest)
    print(name + ": ")
    print("-" * 10)
    # first is the cross-validated R2
    print(R2CV)
    # second is the cross-validated root mean squared error (sqrt of the MSE)
    print(np.sqrt(error))
    # third is R2 on the held-out test split
    print(r2_score(yTest, test_predictions))
    # fourth is the root mean squared error on the held-out test split
    print(np.sqrt(mean_squared_error(yTest, test_predictions)))
    print("-" * 30)

Best is GradientBoostingRegressor -- > 0.98

In [None]:
# Persist the processed frame without the index column.
data.to_csv(path_or_buf="newschool.csv", index=False)

# TRYING MODEL

In [None]:
# Preview the first five rows of the final frame.
print(data.head(n=5))

'NOR', 'School_Phase', 'Pupil_Teacher_Ratio'

In [None]:
# One-row frame holding a hypothetical school to predict for.
# The columns carry the feature names listed above ('NOR', 'School_Phase',
# 'Pupil_Teacher_Ratio') instead of the integer labels 0, 1, 2, so each value
# is tied to its feature by name rather than by position.
newfeaturesvalue = pd.DataFrame(
    {
        "NOR": [200],
        "School_Phase": [4],
        "Pupil_Teacher_Ratio": [3],
    }
)

In [None]:
# Predict Success_Mean for the hypothetical school with the fitted
# gradient-boosting regressor and print the resulting array.
predict = gbmr.predict(newfeaturesvalue)
print(predict)