In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: `dirname` and `filename` stay bound to the last values visited once the
# loop finishes; later cells should avoid depending on that leftover state.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Allow up to 100 rows when pandas renders a frame or series.
pd.options.display.max_rows = 100

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

%matplotlib inline

# Data Reading and Exploration

In [None]:
# Locate the dataset explicitly instead of reusing the `dirname`/`filename`
# variables left over from the directory-listing loop: those only point at the
# right file when the dataset happens to be the last file visited.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# NOTE(review): assumes the first CSV in sorted path order is the dataset to
# analyse -- confirm if more than one input is attached to the notebook.
df = pd.read_csv(csv_paths[0])

print(df.shape)
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
df.head()

In [None]:
# Summary statistics as a first look for outliers.
numeric_summary = df.describe()  # continuous variables
print(numeric_summary)
df.describe(include="object")

In [None]:
# Outlier check: distribution of `posttest` within each school.
df.boxplot(column=['posttest'], by='school', grid=False)

In [None]:
# Outlier check: distribution of `posttest` within each classroom.
df.boxplot(column=['posttest'], by='classroom', grid=False)

In [None]:
# How many students come from the same schools?
students_per_school = df.groupby("school")["student_id"].count()
# A notebook cell only renders its last expression, so the per-school counts
# are shown explicitly (`display` is provided by the IPython kernel).
display(students_per_school)

# How many students come from the same classes in the same schools?
df.groupby(["school", "classroom"])["student_id"].count()

# How many students come from the same classes in the same schools?

In [None]:
# Kernel density estimate of the `posttest` column.
df["posttest"].plot(kind="kde")

In [None]:
# Kernel density estimate of the `pretest` column.
df["pretest"].plot(kind="kde")

In [None]:
# Pairwise plots of the columns seaborn can plot, coloured by `gender`.
sns.pairplot(data=df, hue="gender")

# Feature Selection

## Numerical Variables

In [None]:
# Build a separate frame without `school` and `student_id`; drop() returns a
# new frame, so `df` itself is left untouched.
df_small = df.drop(columns=["school", "student_id"])
print(df_small)

# Restrict the correlation matrix to numeric columns: pandas >= 2.0 raises a
# TypeError when DataFrame.corr() meets string columns instead of silently
# ignoring them as older versions did.
cor = df_small.select_dtypes(include="number").corr()

plt.figure(figsize=(12,10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

## Encoding Categorical Variables

In [None]:
# Keep every object-dtype (string) column except the student identifier.
x_category = df.select_dtypes(include="object").drop(columns=["student_id"])
x_category

In [None]:
# Inspect the distinct levels of each categorical variable.
col_list = x_category.columns

def unique_col(col):
    """Print the column name `col`, then the unique values of `x_category[col]`."""
    print(col)
    print(x_category[col].unique())

# A plain loop replaces the list comprehension that was used only for its side
# effects and rendered a list of `None` values as the cell output.
for col in col_list:
    unique_col(col)

In [None]:
# Replace each categorical column by its integer factor codes, then compute the
# pairwise Pearson correlation of those codes.
category_codes = x_category.apply(lambda column: pd.factorize(column)[0])
corr_cat = category_codes.corr(method='pearson', min_periods=1)
corr_cat

In [None]:
# Remove `classroom` from the dataframe: it correlates highly with `school`
# and has more levels than `school`.
# Rebinding with errors="ignore" keeps this cell safe to re-run; the in-place
# drop raised KeyError on a second execution.
df = df.drop(columns="classroom", errors="ignore")

In [None]:
# Encoding categorical variables
x_cat_small = x_category.drop(columns=["school", "classroom"])

# Request integer dummies explicitly: pandas >= 2.0 returns boolean columns by
# default, and a frame mixing bool and numeric columns converts to an object
# array, which statsmodels' OLS rejects.
X = pd.get_dummies(data=x_cat_small, drop_first=True, dtype=int)
print(X.head())
X.shape

In [None]:
# Append the numeric predictors to the dummy-encoded frame, positionally.
for numeric_col in ("n_student", "pretest"):
    X[numeric_col] = df[numeric_col].values

## Mutation
New variable derived from the school variable

In [None]:
# Variable mutation: `school` has many levels, so each school is summarised by
# the median `pretest` score of its rows.
# (A per-classroom mean of `posttest` was sketched here as well but is disabled.)
school_pretest_median = (
    df.groupby("school", as_index=False)["pretest"]
    .median()
    .rename(columns={"pretest": "grouped_school_pret"})
)
school_pretest_median

In [None]:
# Schools ordered from lowest to highest median pretest score.
school_pretest_median.sort_values("grouped_school_pret", ascending=True)

In [None]:
# Look up the row for school "ZOWMK".
school_pretest_median.loc[school_pretest_median["school"].eq("ZOWMK")]

In [None]:
# Attach each school's median pretest score to every row of `df`.
# Dropping a previously merged column first keeps the cell re-runnable: without
# it a second run produces `_x`/`_y` suffixed columns and the lookup below fails.
# validate="many_to_one" makes the merge fail loudly if a school appears more
# than once in the lookup table, which would otherwise duplicate rows.
df = (
    df.drop(columns="grouped_school_pret", errors="ignore")
    .merge(school_pretest_median, on="school", how="left", validate="many_to_one")
)
print(df)
# A left merge preserves the row order of `df`, so the positional assignment
# stays aligned with the rows of X.
X["grouped_school_pret"] = df["grouped_school_pret"].values
X

In [None]:
# Correlate numeric columns only: pandas >= 2.0 raises a TypeError when
# DataFrame.corr() is called on a frame that still holds string columns.
df.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the correlations among the columns of X (target not included).
cor_no_y = X.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=cor_no_y, cmap=plt.cm.Reds, annot=True)
plt.show()

In [None]:
# Copy of the predictors with the `posttest` column appended positionally.
x_cor = X.assign(posttest=df["posttest"].values)
x_cor

In [None]:
# Pearson correlation matrix of the predictors together with `posttest`.
x_cor.corr(method="pearson")

In [None]:
# Heatmap of the correlations among the predictors and `posttest`.
corr_with_target = x_cor.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=corr_with_target, cmap=plt.cm.Reds, annot=True)
plt.show()

# Train Model

In [None]:
# Target vector for the regression.
y = df_small["posttest"].values
# Leave the engineered school-level column out of the predictors. Rebinding
# with errors="ignore" keeps the cell re-runnable; the in-place drop raised
# KeyError on a second execution.
X = X.drop(columns="grouped_school_pret", errors="ignore")
X

In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(123)

# Hold out 20% of the rows for evaluation; random_state=5 fixes the shuffle.
split_parts = train_test_split(X, y, random_state=5, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

## Fitting a Linear Regression Model

In [None]:
from sklearn import linear_model

# Fit a linear regression on the training split and predict the held-out rows.
lm = linear_model.LinearRegression()
model = lm.fit(x_train, y_train)

predictions = model.predict(x_test)

In [None]:
## The line / model
# Predicted values plotted against the true values of the test split.
plt.scatter(x=y_test, y=predictions)
plt.xlabel(xlabel="True Values")
plt.ylabel(ylabel="Predictions")

In [None]:
# seaborn >= 0.12 accepts `x` and `y` only as keyword arguments; the positional
# form raises a TypeError there.
sns.regplot(x=y_test, y=predictions)

In [None]:
# Score of the fitted model on the held-out split.
test_score = model.score(x_test, y_test)
print("Score:", test_score)

In [None]:
import statsmodels.api as sm

# Add the intercept column once (the statement was previously duplicated) and
# cast the predictors to float so boolean dummy columns cannot turn the design
# matrix into an object array, which statsmodels refuses to fit.
x_train_Sm = sm.add_constant(x_train.astype(float))
ls = sm.OLS(y_train, x_train_Sm).fit()
print(ls.summary())

In [None]:
# One row per column of X holding the fitted model's coefficient for it.
coeff_parameter = pd.Series(model.coef_, index=X.columns, name='Coefficient').to_frame()
coeff_parameter