In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Relative path to the test-scores CSV that the rest of the notebook works on.
SCORES_CSV = '../input/predict-test-scores-of-students/test_scores.csv'

# Read the table once; every later cell derives its data from `scores`.
scores = pd.read_csv(filepath_or_buffer=SCORES_CSV)
scores.head(n=5)

In [None]:
# Print a summary of `scores`: column names, non-null counts and dtypes.
scores.info()

In [None]:
# Summary statistics of `scores`, using describe()'s default arguments.
scores.describe()

In [None]:
# Flag every row that exactly repeats an earlier row, then report how many there are.
duplicate_mask = scores.duplicated()
print('Duplicates:', duplicate_mask.sum())


In [None]:
# Percentage of missing entries per column; only columns that actually have
# missing values are shown, largest share first.
missing = scores.isna().mean() * 100
has_missing = missing > 0
missing.loc[has_missing].sort_values(ascending=False)

In [None]:
# Split the column names by storage dtype: columns stored as "object" are
# treated as categorical, all remaining columns as numeric.
column_dtypes = scores.dtypes
is_object = column_dtypes == "object"

numeric_feats = column_dtypes[~is_object].index
cat_feats = column_dtypes[is_object].index

print('Numerical features length:', len(numeric_feats))
print('Categorical features length:', len(cat_feats))

In [None]:

# `cat` holds the object-dtype (text) columns of `scores`.
cat = scores.select_dtypes(include='object')

# `scores_cat` is an explicit, independent copy of those columns. A later cell
# assigns new values into `scores_cat` column by column; working on a real copy
# keeps those writes away from `scores` and avoids SettingWithCopyWarning.
# (The original re-selected the same columns from `scores` via `.loc`, which
# duplicated the selection already stored in `cat`.)
scores_cat = cat.copy()
scores_cat.head(10)

In [None]:
# Distinct labels that occur in the teaching_method column.
teaching_method = scores.loc[:, "teaching_method"]
val = teaching_method.unique()
val

In [None]:
# One bar chart of label counts per categorical column, drawn side by side.
# Each entry pairs the column to count with the title of its panel.
bar_panels = [
    ('school_setting', 'Distribution of School Location'),
    ('gender', 'Sex Distribution'),
    ('teaching_method', 'Standard VS. Experimental Studies'),
    ('school_type', 'Distribution of School Type'),
]

# The 16x6 inch size applies to the whole figure, so it is set at creation.
fig, axes = plt.subplots(1, 4, figsize=(16, 6))
for panel_ax, (panel_column, panel_title) in zip(axes, bar_panels):
    scores[panel_column].value_counts().plot.bar(ax=panel_ax)
    panel_ax.set_title(panel_title)

fig.suptitle('Glances From the Dataset')
plt.tight_layout()


In [None]:
# Integer code assigned to each text label, per column.
# The binary columns use 0/1; school_setting uses 0/1/2 in the order
# Rural < Suburban < Urban.
label_codes = {
    'lunch': {'Does not qualify': 0, 'Qualifies for reduced/free lunch': 1},
    'school_type': {'Non-public': 0, 'Public': 1},
    'teaching_method': {'Standard': 0, 'Experimental': 1},
    'school_setting': {'Rural': 0, 'Suburban': 1, 'Urban': 2},
}

# Rebuild `scores_cat` from the raw `scores` table instead of editing it in
# place: the cell can be re-run safely, and no derived slice is mutated.
# `Series.map` with an explicit dictionary yields integer columns directly,
# whereas chained `.replace(str, int)` relies on silent object->int
# downcasting, which newer pandas releases deprecate.
# NOTE: a label that is not listed in `label_codes` becomes NaN here (the
# chained replace left such labels untouched as text).
scores_cat = scores.select_dtypes(include='object').assign(
    **{column: scores[column].map(codes) for column, codes in label_codes.items()}
)

scores_cat.head()

In [None]:
# The four label columns of `scores_cat` that are used as numeric features.
encoded_columns = ['school_setting', 'school_type', 'teaching_method', 'lunch']
num_4 = scores_cat.loc[:, encoded_columns]
num_4

In [None]:
# Class size and the two test scores, taken from the raw table.
measured_columns = ['n_student', 'pretest', 'posttest']
num_3 = scores.loc[:, measured_columns]
num_3

In [None]:
# Place the two column groups side by side (aligned on the row index) to form
# the modelling table.
data = pd.concat(objs=[num_4, num_3], axis='columns')
data

In [None]:
# Pairwise correlations between all columns of `data`, shown as an annotated heatmap.
df = data.corr()

# sns.set() changes global plot settings, so this 15x8 figure size also
# applies to figures created by later cells.
sns.set(rc={'figure.figsize': (15, 8)})
sns.heatmap(data=df, annot=True)

In [None]:
# Ordinary least squares fit of posttest on pretest and teaching_method.
# The model is fitted on the raw `scores` table, where teaching_method still
# holds its text labels, not on the integer-coded `data` table.
ols_formula = 'posttest ~ pretest + teaching_method'
regplot = smf.ols(formula=ols_formula, data=scores).fit()
print(regplot.summary())

In [None]:
# Predictors (pretest score and teaching-method code) and the target (posttest score).
feature_columns = ['pretest', 'teaching_method']
x = data.loc[:, feature_columns]
y = data.loc[:, 'posttest']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

n_test_rows, n_train_rows = len(x_test), len(x_train)
print('the amount of values for test data are', n_test_rows)
print('the amount of values for train data are', n_train_rows)

###### 

In [None]:
# Fit a linear regression on the training split and report its score on the
# held-out test split (for scikit-learn regressors, score() is R-squared).
lre = LinearRegression()
lre.fit(x_train, y_train)

# Message spelling corrected ("R-sqaured" -> "R-squared").
print('The R-squared for the test data is', lre.score(x_test, y_test))

In [None]:
# 4-fold cross-validation of the linear model over the full x / y data;
# `rcross` holds one score per fold and the mean is reported.
rcross = cross_val_score(lre, x, y, cv=4)

# Message spelling corrected ("R-sqaured" -> "R-squared").
print('The average R-squared for all 4 samples of the data is', rcross.mean())

In [None]:
# k-nearest-neighbours regressor (k = 5): fit on the training split, then
# print its score on the held-out test split.
knn = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)

knn_test_score = knn.score(x_test, y_test)
print(knn_test_score)

In [None]:
# Random-forest regressor with a fixed seed: fit on the training split, then
# print its score on the held-out test split.
rf = RandomForestRegressor(random_state=42).fit(x_train, y_train)

rf_test_score = rf.score(x_test, y_test)
print(rf_test_score)