In [None]:
# --- data handling ---
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this blanket filter hides every warning category, including
# deprecation warnings raised by numpy / pandas / lightgbm. While it is active,
# the DataConversionWarning filter below has no additional effect.
warnings.filterwarnings(action='ignore')
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)



# --- plotting (static figures rendered inline, high-resolution output) ---
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 

# --- pandas display: show up to 500 columns/rows, floats with two decimals ---
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',500)
pd.set_option('display.float_format', lambda x: '%.2f' % x)



# --- modelling: split, scaling, linear baseline, gradient boosting, metric ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the test-score dataset; the path is relative to the notebook's working directory.
csv_path = '../input/predict-test-scores-of-students/test_scores.csv'
raw_data = pd.read_csv(csv_path)

## DATA ANALYSIS 

In [None]:
# (number of rows, number of columns) of the loaded frame
raw_data.shape

In [None]:
# first five rows (head() defaults to n=5)
raw_data.head()

In [None]:
# summary statistics of raw_data (pandas describes the numeric columns by default)
raw_data.describe()

### MISSING DATA

In [None]:
# Share of missing (NaN) entries per column, keyed by column name.
n_rows = len(raw_data.index)
column_miss = {name: raw_data[name].isna().sum() / n_rows for name in raw_data}


In [None]:
# Columns ranked by their share of missing data, largest first. No missing data found.
sorted(column_miss.items(), key=lambda item: item[1], reverse=True)


### DISTRIBUTION

In [None]:
# Split the frame by dtype: object columns are treated as categorical,
# int64/int32/float64 columns as continuous.
categoric = raw_data.select_dtypes(include=['object'])
cont = raw_data.select_dtypes(include=['int64', 'int32', 'float64'])

# column labels of each group, reused by the cells below
categoric_cols, cont_cols = categoric.columns, cont.columns


In [None]:
# distinct-value count for every categorical column
raw_data.loc[:, categoric_cols].nunique()

In [None]:
# distinct-value count for every continuous column
raw_data.loc[:, cont_cols].nunique()

In [None]:
# Uniqueness check on student_id: lists every row whose id already appeared earlier
# (an empty result means all ids are unique).
repeated_id = raw_data['student_id'].duplicated()
raw_data[repeated_id]

In [None]:
def hist_plotter(col):
    '''
    Draws a histogram of one raw_data column on a new 10x6 figure.

    col : name of the column in the module-level raw_data dataframe; it is also
          used as the prefix of the figure title.
    '''
    figure = plt.figure(figsize=(10, 6))
    axes = figure.gca()
    axes.hist(raw_data[col])
    axes.set_title(col + " histogram")

In [None]:
# one histogram figure per continuous column
for column_name in cont_cols:
    hist_plotter(column_name)

In [None]:
# columns with fewer than 24 distinct values are treated as categorical for the pie charts
unique_counts = raw_data.nunique()
cat_cols = unique_counts[unique_counts < 24].keys().tolist()


def plot_pie(column) :
    '''
    Draws two side-by-side donut charts for one column of raw_data.

    Left chart  : number of rows per distinct value of the column.
    Right chart : mean 'posttest' per distinct value of the column.

    Both charts list the categories in the same order (the value_counts order),
    so every label is paired with the mean of its own group.

    column : name of a column in the module-level raw_data dataframe.

    Uses the module-level names `go` and `py`, which must be imported before the
    function is called. Returns nothing; the figure is rendered with py.iplot.
    '''
    counts = raw_data[column].value_counts()
    categories = counts.keys().tolist()

    # groupby orders its result by category key while value_counts orders by
    # frequency. Re-ordering the means to follow `categories` keeps labels and
    # values paired by category instead of by accidental position.
    mean_posttest = raw_data[[column, 'posttest']].groupby(column).mean()['posttest']
    mean_posttest = mean_posttest.loc[categories]

    trace1 = go.Pie(values  = counts.values.tolist(),
                    labels  = categories,
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [0,.4]),
                    name = "Population Distribution",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    hole    = .6
                   )

    trace2 = go.Pie(values  = mean_posttest.tolist(),
                    labels  = categories,
                    name = "Average Posttest",
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [.5,1]),
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    hole    = .6,
                    textinfo='value'  # show the mean itself rather than a percentage
                   )

    layout = go.Layout(dict(title = column + " Distribution",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "Population Distribution",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .1, y = .5), # caption inside the left donut
                                           dict(text = "Average Posttest",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .82, y = .5 # caption inside the right donut
                                               )
                                          ]
                           )
                      )
    data = [trace1, trace2]
    fig  = go.Figure(data = data, layout = layout)
    py.iplot(fig)

In [None]:
# plot_pie looks up `go` and `py` when it is called, so they have to be
# imported before the loop below runs.
import plotly.graph_objs as go
import plotly.offline as py

py.init_notebook_mode(connected=True)

# one pair of donut charts per low-cardinality column
for column_name in cat_cols:
    plot_pie(column_name)

### CORRELATION

In [None]:
# Correlation Matrix
# Restrict to numeric/bool columns explicitly: older pandas dropped the object
# columns silently, pandas >= 2.0 raises when asked to correlate them.
corr = raw_data.select_dtypes(include=['number', 'bool']).corr()

# A single figure is created here; the former extra plt.figure() call only
# produced an empty second figure.
f, ax = plt.subplots(figsize=(10, 8))
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
# The mask is all False, i.e. every cell of the matrix is drawn.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

## MODELLING

### base model with two continuous variables

In [None]:
# Baseline: linear regression on the two continuous features only.
baseline_features = ['pretest', 'n_student']
X_reg = raw_data[baseline_features]  # features
y_reg = raw_data['posttest']         # target

# hold out 33% of the rows for testing, with a fixed seed
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.33, random_state=17
)

# fit on the training split
reg = LinearRegression()
reg.fit(X_train_reg, y_train_reg)

# base performance: mean absolute error on the held-out split
y_pred_reg = reg.predict(X_test_reg)
mean_absolute_error(y_test_reg, y_pred_reg)

### preparing for lightgbm

In [None]:
# the id column is dropped before modelling
modelling_data = raw_data.drop('student_id', axis=1)
# categorical features are expanded into dummy (indicator) columns
dummied_data = pd.get_dummies(modelling_data)

In [None]:
# summary statistics of the dummy-encoded modelling frame
dummied_data.describe()

In [None]:
# 67% train / 33% test split with a fixed seed
y = dummied_data['posttest']                 # target
X = dummied_data.drop(columns=['posttest'])  # every column except the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=17
)

In [None]:
# StandardScaler is fitted on the training features only; the same fitted
# transformation is then applied to both the train and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# lgb params (shared by every lgb.train call in this notebook)
params = {'boosting_type':'gbdt',
          'objective':'mae',
          'metric':'mean_absolute_error',
          'max_depth':-1,
          'num_leaves':9,
          'colsample_bytree' : 0.7,
          'learning_rate':0.03,
          'n_estimators': 500,
          'random_state': 42,
          'verbose': -1}

# create dataset for lightgbm with scaled data
# The validation set names its training set as `reference`, matching the
# non-scaled pair below, so both validation Datasets are declared the same way.
lgb_train_scaled = lgb.Dataset(X_train_scaled, label = y_train, params={'verbose': -1})
lgb_test_scaled = lgb.Dataset(X_test_scaled, label = y_test, reference=lgb_train_scaled, params={'verbose': -1})

# create dataset for lightgbm non-scaled data
lgb_train = lgb.Dataset(X_train, label = y_train, params={'verbose': -1})
lgb_test = lgb.Dataset(X_test, label = y_test, reference=lgb_train, params={'verbose': -1})


In [None]:
#creating a lightgbm model with both scaled and normal features
# The `verbose_eval` argument was removed from lgb.train in LightGBM 4.0.
# The log_evaluation callback (LightGBM 3.3+) prints the validation metric
# every 50 boosting rounds instead.
lgb_model_scaled = lgb.train(params, lgb_train_scaled,
                             valid_sets=[lgb_train_scaled, lgb_test_scaled],
                             callbacks=[lgb.log_evaluation(period=50)])
lgb_model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_test],
                      callbacks=[lgb.log_evaluation(period=50)])


In [None]:
# Test-set MAE of the model trained on scaled features next to the one trained on raw features.
y_pred_test_scaled = lgb_model_scaled.predict(X_test_scaled)
y_pred_test = lgb_model.predict(X_test)

for caption, predictions in (("scaled lightbm results: ", y_pred_test_scaled),
                             ("lightbm results: ", y_pred_test)):
    print(caption, mean_absolute_error(y_test, predictions))
#no change with scaled features

In [None]:
# Per-feature importance of the non-scaled model: split count and gain
# expressed as a percentage of the total gain, ordered by gain (largest first).
gain = lgb_model.feature_importance(importance_type='gain')
split_counts = lgb_model.feature_importance(importance_type='split')
gain_share = 100 * gain / gain.sum()

feature_importance = pd.DataFrame({'feature': X.columns,
                                   'split': split_counts,
                                   'gain': gain_share})
feature_importance = feature_importance.sort_values(by='gain', ascending=False)
feature_importance.head(20) #top 20 variables

In [None]:
#finding optimal number of features with forward selection
#orders feature with gain order from the lightgbm model that uses all features
#starts with first feature and creates a model with only 1 feature
#adds a new feature and creates a model, iterates until all variables are used
#saves results of number of variable used and performance of that model
ranked_features = feature_importance['feature'].tolist()

results = []
# The upper bound is len + 1 so the last iteration really uses every feature;
# range(1, len) stopped one short and never evaluated the full model.
for i in range(1, len(ranked_features) + 1):
    selected = ranked_features[:i]
    lgb_new_train = lgb.Dataset(X_train[selected], label = y_train)
    # the validation set references the training set built from the SAME feature
    # subset, not the all-feature lgb_train
    lgb_new_test = lgb.Dataset(X_test[selected], y_test, reference=lgb_new_train)
    # `verbose_eval` was removed in LightGBM 4.0; log_evaluation(period=0)
    # keeps training silent on both 3.3.x and 4.x
    lgb_new_model = lgb.train(params, lgb_new_train, valid_sets=[lgb_new_train, lgb_new_test],
                              callbacks=[lgb.log_evaluation(period=0)])
    y_pred_new_test = lgb_new_model.predict(X_test[selected])
    results.append([i, mean_absolute_error(y_test, y_pred_new_test)])
results_df = pd.DataFrame(results, columns=['number_of_variables', 'mae'])

In [None]:
# test MAE of the first 20 forward-selection steps (one row per feature-subset size)
results_df.head(20)

In [None]:
#plotting number of variables and mae results
plt.figure(figsize=(20,20))
# x is the actual number of variables; plotting the series alone would use the
# 0-based row index and shift every point one step to the left.
# The label gives plt.legend() an artist to list.
plt.plot(results_df['number_of_variables'], results_df['mae'], label='mae')
plt.title("mae with number of variables")
plt.xlabel('number of variables')
plt.ylabel('mae')
plt.legend()
plt.show()

In [None]:
#final model
# keeps the 6 highest-gain features
# NOTE(review): 6 is presumably read off the forward-selection results - confirm
final_features = list(feature_importance['feature'][:6])
lgb_final_train = lgb.Dataset(X_train[final_features], label = y_train)
# the validation set references the training set with the same 6 features,
# not the all-feature lgb_train
lgb_final_test = lgb.Dataset(X_test[final_features], y_test, reference=lgb_final_train)
# `verbose_eval` was removed in LightGBM 4.0; log_evaluation(period=0) keeps
# training silent on both 3.3.x and 4.x
lgb_new_model = lgb.train(params, lgb_final_train, valid_sets=[lgb_final_train, lgb_final_test],
                          callbacks=[lgb.log_evaluation(period=0)])
y_pred_final_test = lgb_new_model.predict(X_test[final_features])
mean_absolute_error(y_test, y_pred_final_test)

In [None]:
# Importance table of the final model: split count and gain share (percent of
# total gain) for the 6 features it was trained on, ordered by gain.
top_features = list(feature_importance['feature'][:6])
gain_final = lgb_new_model.feature_importance(importance_type='gain')
split_final = lgb_new_model.feature_importance(importance_type='split')

feature_importance_final = pd.DataFrame({'feature': X_train[top_features].columns,
                                         'split': split_final,
                                         'gain': 100 * gain_final / gain_final.sum()})
feature_importance_final = feature_importance_final.sort_values(by='gain', ascending=False)
feature_importance_final.head(20) #top 20 variables

In [None]:
# Actual posttest scores and final-model predictions side by side; the original
# row labels of y_test are kept in the 'index' column created by reset_index.
y_test_df = y_test.reset_index().assign(pred=y_pred_final_test)
y_test_df.head(20)

In [None]:
# Histogram of the prediction errors (actual posttest minus predicted posttest).
residuals = y_test_df['posttest'] - y_test_df['pred']

plt.figure(figsize=(20, 20))
plt.hist(residuals, label='diffs')
plt.title("prediction vs actual")
plt.legend()
plt.show()