# **Read the Data**

In [None]:
import pandas as pd
# Read student-mat.csv into a DataFrame and preview its first five rows.
data = pd.read_csv(
    filepath_or_buffer='../input/student-grade-prediction/student-mat.csv'
)
data.head(n=5)

In [None]:
# (rows, columns) of the loaded frame
data.shape

In [None]:
# Column labels of the loaded frame
data.columns

In [None]:
# Print the DataFrame.info() summary of the loaded frame
data.info()

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
data.describe()

# **Regression**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the null mask of `data` as a heatmap on an explicit axes object.
fig, ax = plt.subplots(figsize=(12,4))
sns.heatmap(
    data.isnull(),
    yticklabels=False,
    cmap='viridis',
    cbar=False,
    ax=ax,
)
ax.set_title('Missing value in the dataset');

In [None]:
import numpy as np
# Names of every numeric column except the target 'G3'.
numeric_columns = [
    name
    for name in data.select_dtypes(include=np.number).columns
    if name != 'G3'
]

In [None]:
# Horizontal box plots, one per numeric feature column.
data.boxplot(
    column=numeric_columns,
    vert=False,
    grid=False,
    rot=0,
    fontsize=10,
    figsize=(10,10),
)

In [None]:
# Quartiles, IQR and the 1.5*IQR fences for each numeric column.
quartiles = data[numeric_columns].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1
whisker_reach = 1.5 * IQR
boxplot_min = Q1 - whisker_reach
boxplot_max = Q3 + whisker_reach

for heading, values in (
    ('Q1:\n', Q1),
    ('\nQ3:\n', Q3),
    ('\nIQR:\n', IQR),
    ('\nMin:\n', boxplot_min),
    ('\nMax:\n', boxplot_max),
):
    print(heading, values)

In [None]:
# Drop every row that falls outside the [boxplot_min, boxplot_max] fence
# in at least one numeric column. The fences are fixed beforehand, so a
# single combined mask selects the same rows as filtering column by column.
non_outlier_data = data.copy()
numeric_values = non_outlier_data[numeric_columns]
below_fence = numeric_values.lt(boxplot_min, axis='columns')
above_fence = numeric_values.gt(boxplot_max, axis='columns')
outlier_rows = (below_fence | above_fence).any(axis=1)
non_outlier_data = non_outlier_data[~outlier_rows]

In [None]:
# Preview the first five rows that survived the outlier filter
non_outlier_data.head(5)

In [None]:
# (rows, columns) remaining after the outlier filter
non_outlier_data.shape

In [None]:
# check correlation between columns

correlation_between_columns = non_outlier_data[numeric_columns].corr()
# Boolean mask covering the diagonal and the upper triangle. Building the mask
# from the correlation values themselves would leave any upper-triangle cell
# whose coefficient is exactly 0 unmasked, because 0 is falsy.
upper_triangle_corr = np.triu(np.ones_like(correlation_between_columns, dtype=bool))

fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(correlation_between_columns, annot = True, cmap="YlGnBu",ax=ax,annot_kws={"size":15},mask=upper_triangle_corr)

In [None]:
# Scatter each numeric feature (x) against the target G3 (y) to eyeball linearity.
scatter_plot_between_target_feature = sns.pairplot(
    non_outlier_data,
    x_vars=numeric_columns,
    y_vars=['G3'],
    kind='scatter',
    height=3,
)

In [None]:
# Keep only the two feature columns and the target for the regression.
final_feature_column = ['G1','G2']
target_column = ['G3']
final_column = [*final_feature_column, *target_column]
final_data = non_outlier_data.loc[:, final_column]

In [None]:
# (rows, columns) of the regression subset
final_data.shape

In [None]:
# Preview the first ten rows of the regression subset
final_data.head(10)

In [None]:
# Convert the feature columns and the target column to NumPy arrays.
# Both selections use a list of labels, so both arrays are two-dimensional.
final_feature_array = final_data.loc[:, final_feature_column].to_numpy()
target_array = final_data.loc[:, target_column].to_numpy()

In [None]:
# Report the dimensions of the feature matrix and the target array.
for caption, array in (
    ('shape of final feature:', final_feature_array),
    ('shape of target:', target_array),
):
    print(caption, array.shape)

In [None]:
import sklearn.model_selection as model_selection
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = dict(train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    final_feature_array, target_array, **split_options
)

In [None]:
# Print the shape of each of the four split arrays.
for caption, array in (
    ('Shape Data X Train:', X_train),
    ('\nShape Data X Test:', X_test),
    ('\nShape Data y Train:', y_train),
    ('\nShape Data y Test:', y_test),
):
    print(caption)
    print(array.shape)

In [None]:
import sklearn.linear_model as linear_model
# Unfitted scikit-learn LinearRegression estimator with default settings
regression_model = linear_model.LinearRegression()

In [None]:
# Fit the regression model on the training split only
regression_model.fit(X_train, y_train)

In [None]:
# Show the fitted coefficients and intercept.
fitted_coefficients = regression_model.coef_
fitted_intercept = regression_model.intercept_
print('Coefficients:\n', fitted_coefficients)
print('Intercept:', fitted_intercept)

In [None]:
# Run the fitted model over the training rows, the test rows and the full feature matrix.
y_train_pred, y_test_pred, target_array_pred = (
    regression_model.predict(features)
    for features in (X_train, X_test, final_feature_array)
)

In [None]:
# Print the first five observed training targets, then the first five fitted values.
for heading, values in (
    ('Real Data', y_train),
    ('\n Predicted Data', y_train_pred),
):
    print(heading)
    print(values[:5])

In [None]:
# check the data in the form of dataframe
# Store the predictions in their own column rather than overwriting 'G3',
# so the observed and the predicted grade can be compared row by row.
final_with_pred_data = final_data.assign(G3_predicted=target_array_pred.reshape(-1,))
final_with_pred_data.head(5)

In [None]:
# evaluate regression model - RMSE
from sklearn.metrics import mean_squared_error
# Take the square root of the MSE explicitly. The `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
rmse_training = np.sqrt(mean_squared_error(y_true=y_train,y_pred=y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_true=y_test,y_pred=y_test_pred))

print('RMSE Training Data: {}'.format(rmse_training))
print('RMSE Test Data: {}'.format(rmse_test))

In [None]:
# evaluate regression model - R squared (score() is evaluated on the training split)
training_r_squared = regression_model.score(X_train, y_train)
print('R^2 score:', training_r_squared)

In [None]:
from scipy import stats
# Signed residuals (observed - predicted), flattened to 1-D.
# Taking abs() of each side first would flip the sign of the residual whenever a
# prediction is negative, and the (n, 1) shape is not what the tests expect.
residual = (target_array - target_array_pred).ravel()
sw = stats.shapiro(residual)
# kstest(..., 'norm') alone compares against the standard normal N(0, 1), which
# unstandardised residuals fail regardless of their shape. Compare against a
# normal with the residuals' own mean and standard deviation instead.
# NOTE(review): the parameters are estimated from the same sample, so this
# p-value is only approximate (the Lilliefors correction is not applied).
ks = stats.kstest(residual, 'norm', args=(residual.mean(), residual.std(ddof=1)))

print('Shapiro-Wilk test ---- statistic: {}, p-value: {}'.format(sw[0],sw[1]))
print('Kolmogorov-Smirnov test ---- statistic: {}, p-value: {}'.format(ks.statistic,ks.pvalue))

In [None]:
import statsmodels.api as sm
# statsmodels' OLS does not add an intercept on its own. Prepend a constant
# column so the summary describes the same model (slope terms + intercept)
# that LinearRegression fitted, not a regression through the origin.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

# **Classification**

In [None]:
# Preview the outlier-filtered frame that the classification section starts from
non_outlier_data.head()


In [None]:
# Replace the three grade columns with their row-wise mean.
# Building a new frame with assign/drop (no column assignment or inplace drop on
# the boolean-filtered frame) avoids pandas' SettingWithCopyWarning on versions
# without copy-on-write. The resulting column order is unchanged: 'avg_score' last.
grade_columns = ['G1', 'G2','G3']
non_outlier_data = (
    non_outlier_data
    .assign(avg_score=lambda frame: frame[grade_columns].mean(axis=1))
    .drop(columns=grade_columns)
)
non_outlier_data.head()

In [None]:
# Names of every numeric column except 'avg_score'.
numeric_columns = [
    name
    for name in non_outlier_data.select_dtypes(include=np.number).columns
    if name != 'avg_score'
]

In [None]:
# check correlation between columns
correlation_between_column = non_outlier_data[numeric_columns].corr()
# Boolean mask covering the diagonal and the upper triangle. Building the mask
# from the correlation values themselves would leave any upper-triangle cell
# whose coefficient is exactly 0 unmasked, because 0 is falsy.
upper_triangle_corr = np.triu(np.ones_like(correlation_between_column, dtype=bool))

fig, ax = plt.subplots(figsize=(40,40))
sns.heatmap(correlation_between_column, annot = True, cmap="YlGnBu",ax=ax,annot_kws={"size":15},mask=upper_triangle_corr)

In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
# Columns that hold exactly two distinct values.
binary_cols = [
    column_name
    for column_name in non_outlier_data.columns
    if non_outlier_data[column_name].nunique() == 2
]

In [None]:
# Re-encode each two-valued column with a freshly fitted LabelBinarizer.
for column_name in binary_cols:
    encoder = LabelBinarizer()
    encoded_values = encoder.fit_transform(non_outlier_data[column_name])
    non_outlier_data[column_name] = encoded_values

In [None]:
# Transposed preview: one line per column, first five rows as columns
non_outlier_data.head().transpose()

In [None]:
# Low-cardinality columns (at most 30 distinct values) in the working frame.
categorical_cols = [column for column in non_outlier_data.columns if (non_outlier_data[column].nunique()<=30)]
# Read the dtype from the frame that is being encoded, not from the raw `data`:
# `data` has no 'avg_score' column (KeyError if that column is low-cardinality),
# and it still reports the already-binarised columns as object dtype, which
# would send them through dummy encoding a second time.
str_col = [col for col in categorical_cols if non_outlier_data[col].dtype =='O']
str_col

In [None]:
# List the distinct values of every column selected for dummy encoding.
for column_name in str_col:
    distinct_values = non_outlier_data[column_name].unique()
    print(column_name, ' :', distinct_values)

In [None]:
# Independent snapshot of the frame taken before the dummy-encoding step
non_outlier_data_ = non_outlier_data.copy()

In [None]:
# Print the name of each column being encoded, then one-hot encode them in a
# single call. Each source column is dropped and its indicator columns
# (first level omitted) are appended at the end, in `str_col` order.
for column_name in str_col:
    print(column_name)
non_outlier_data = pd.get_dummies(non_outlier_data, columns=str_col, drop_first=True)

In [None]:
# Create 'Good_Student' as a copy of the raw average score; these values are
# replaced by 0/1 labels once classify() is applied to 'avg_score' further down.
non_outlier_data['Good_Student'] = non_outlier_data['avg_score'].copy()

In [None]:
def classify(x, threshold=15):
    """Label a score as 1 when it is strictly above ``threshold``, else 0.

    Parameters
    ----------
    x : number
        Score to label.
    threshold : number, default 15
        Cutoff; a score equal to the cutoff is labelled 0. The default keeps
        the behaviour of the original one-argument form.

    Returns
    -------
    int
        1 if ``x > threshold``, otherwise 0 (including when ``x`` is NaN,
        because comparisons with NaN are false).
    """
    return 1 if x > threshold else 0

# Turn the average score into a 0/1 label by applying classify() to each value
non_outlier_data['Good_Student'] = non_outlier_data['avg_score'].apply(classify)

In [None]:
# Transposed preview of the first ten rows, including the new label column
non_outlier_data.head(10).transpose()

In [None]:
# Remove 'avg_score' now that the label has been derived from it.
non_outlier_data = non_outlier_data.drop(columns=['avg_score'])

In [None]:
# Every column except the label is a feature; convert both parts to NumPy arrays.
final_feature_column = [
    name for name in non_outlier_data.columns if name != 'Good_Student'
]
final_feature_array = non_outlier_data.loc[:, final_feature_column].to_numpy()
target_array = non_outlier_data.loc[:, 'Good_Student'].to_numpy()

In [None]:
# Report the dimensions of the feature matrix and the label array.
for caption, array in (
    ('shape of final feature:', final_feature_array),
    ('shape of target:', target_array),
):
    print(caption, array.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = dict(train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    final_feature_array, target_array, **split_options
)

In [None]:
# Print the shape of each of the four split arrays.
for caption, array in (
    ('Shape Data X Train:', X_train),
    ('\nShape Data X Test:', X_test),
    ('\nShape Data y Train:', y_train),
    ('\nShape Data y Test:', y_test),
):
    print(caption)
    print(array.shape)

In [None]:
# load the algorithm
import sklearn.neighbors as neighbors
# Unfitted k-nearest-neighbours classifier that votes over 10 neighbours
classification_model = neighbors.KNeighborsClassifier(n_neighbors=10)

In [None]:
# Fit the classifier on the training split only
classification_model.fit(X_train, y_train)

In [None]:
# Run the fitted classifier over the training rows, the test rows and the full feature matrix.
y_train_pred, y_test_pred, target_array_pred = (
    classification_model.predict(features)
    for features in (X_train, X_test, final_feature_array)
)

In [None]:
# Print the first five observed training labels, then the first five predicted labels.
for heading, values in (
    ('Real Data', y_train),
    ('\n Predicted Data', y_train_pred),
):
    print(heading)
    print(values[:5])

In [None]:
# Copy of the encoded frame with the predicted label appended as an extra column.
final_with_pred_data = non_outlier_data.assign(
    is_Good_Student_predicted=target_array_pred.reshape(-1,)
)
final_with_pred_data.head(5)

In [None]:
# evaluate classification model - accuracy
import sklearn.metrics as metrics
# Accuracy on the training split and on the held-out test split.
accuracy_training, accuracy_test = (
    metrics.accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print('Accuracy Training Data: {}'.format(accuracy_training))
print('Accuracy Test Data: {}'.format(accuracy_test))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the training split, then for the test split.
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(confusion_matrix(actual, predicted))