In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set_style('white')

In [None]:
# Load the training and test splits from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train_df.describe().T

In [None]:
# Print the per-column summary of the training frame.
train_df.info()

In [None]:
# Build a ydata-profiling report for the training frame; the bare `report`
# expression on the last line makes the notebook display it as the cell output.
from ydata_profiling import ProfileReport
report = ProfileReport(train_df)
report

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

In [None]:
# Remove fully duplicated rows from the training frame (mutates train_df in place).
train_df.drop_duplicates(inplace=True)

## EDA

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the Target labels with the integer codes assigned by a LabelEncoder
# fitted on that same column.
lbl_en = LabelEncoder().fit(train_df['Target'])
train_df['Target'] = lbl_en.transform(train_df['Target'])

In [None]:
# Correlation of every numeric column with the encoded Target, shown as a
# bar chart sorted from the highest to the lowest coefficient.
numeric_corr = train_df.select_dtypes(include='number').corr()
target_corr = numeric_corr['Target'].drop('Target').sort_values(ascending=False)
target_corr.plot(kind='bar')

In [None]:
# Heatmap of the pairwise correlations between all numeric columns.
corr_matrix = train_df.select_dtypes(include='number').corr()
sns.heatmap(
    corr_matrix,
    cmap='magma',
    linecolor='white',
    linewidths=0.725,
)

In [None]:
# Columns plotted as categorical features (the list also contains 'Target').
cat_cols = [
    'Marital status',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
    'Target',
]
# Columns plotted as numerical features.
num_cols = [
    'Application mode',
    'Application order',
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

In [None]:
# Collapse marital status to a binary flag: 0 when the code is 1, 1 for every
# other code (2,3,4,5,6).
# The same transformation is applied to the test set: test_df is later passed
# through a scaler fitted on the transformed training column, so leaving it raw
# would feed the models inconsistent features.
# NOTE: run this cell once per kernel session; re-running it would flip the
# already-binarised values (0 -> 1, 1 -> 0).
for frame in (train_df, test_df):
    frame['Marital status'] = (frame['Marital status'] != 1).astype(int)

In [None]:
# Frequency of each distinct 'Marital status' value in the training frame.
train_df['Marital status'].value_counts()

In [None]:
# Map course codes to consecutive integers.
# The encoder is fitted on the training codes only and the resulting
# code -> index table is applied to BOTH frames; previously only train_df was
# encoded, so the test set kept raw course codes while being scaled with
# statistics computed on the encoded training column.
# Course codes that occur only in the test set are mapped to -1 instead of
# raising, which is what LabelEncoder.transform would do for unseen labels.
course_to_idx = {code: idx for idx, code in enumerate(lbl_en.fit(train_df['Course']).classes_)}
train_df['Course'] = train_df['Course'].map(course_to_idx)
test_df['Course'] = test_df['Course'].map(course_to_idx).fillna(-1).astype(int)

In [None]:
# Histogram with a KDE overlay for every numerical column, on a two-column grid.
grid_rows = (len(num_cols) + 1) // 2
fig, axes = plt.subplots(nrows=grid_rows, ncols=2, figsize=(20, 20))
# axes.flat walks the grid row by row, matching the (i//2, i%2) placement.
for ax, column in zip(axes.flat, num_cols):
    sns.histplot(train_df[column], ax=ax, kde=True, bins=40)
    ax.set(title=f'Histplot of {column}', xlabel=column)
plt.tight_layout()
plt.show()

In [None]:
# One horizontal boxplot per numerical column, on a two-column grid.
grid_rows = (len(num_cols) + 1) // 2
fig, axes = plt.subplots(nrows=grid_rows, ncols=2, figsize=(20, 20))
# axes.flat walks the grid row by row, matching the (i//2, i%2) placement.
for ax, column in zip(axes.flat, num_cols):
    sns.boxplot(x=train_df[column], ax=ax)
    ax.set(title=f'Boxplot of {column}', xlabel=column)
plt.tight_layout()
plt.show()

In [None]:
# Plotting value distributions for the categorical columns (cat_cols),
# one panel per column on a two-column grid.
num_plots = len(cat_cols)
num_rows = (num_plots+1)//2
fig, axes = plt.subplots(nrows=num_rows,ncols=2,figsize=(20,20))
for ax,column in zip(axes.flat,cat_cols):
    sns.histplot(train_df[column],ax=ax,bins=40)
    ax.set_title(f'Countplot of {column}')
    ax.set_xlabel(column)
# With an odd number of columns the grid has one spare panel; hide it so the
# figure does not end with an empty set of axes.
for spare_ax in axes.flat[num_plots:]:
    spare_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Class balance of the target: number of training rows per encoded Target value.
sns.countplot(x='Target',data=train_df,palette='viridis')

In [None]:
# Drop the row identifier from both frames so it is not used as a feature.
# errors='ignore' keeps the cell re-runnable: once 'id' is gone a second
# execution is a no-op instead of raising KeyError.
train_df.drop('id',axis=1,inplace=True,errors='ignore')
test_df.drop('id',axis=1,inplace=True,errors='ignore')

In [None]:
# Integer code -> label name, used to turn predictions back into labels for
# the submission file.
# NOTE(review): assumes the LabelEncoder assigned 0/1/2 to
# Dropout/Enrolled/Graduate (alphabetical order) - confirm against the raw labels.
rev_map = {2 : "Graduate",0:"Dropout",1:"Enrolled"}
# Kept as the last expression so the class counts are actually displayed;
# previously the result was computed and silently discarded.
train_df['Target'].value_counts()

In [None]:
# Preprocessing: separate the label column from the feature columns.
y_train = train_df['Target']
X_train = train_df.drop(columns='Target')
# The test frame has no Target column, so it is used as-is.
X_test = test_df

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply
# them unchanged to the test features (no information from the test set is used).
st_sclr = StandardScaler().fit(X_train)
X_train = st_sclr.transform(X_train)
X_test = st_sclr.transform(X_test)

### Submission function

In [None]:
def pred_to_csv(prediction,model_name):
    """Write a submission CSV for the given predictions.

    The ids are re-read from "test.csv" and paired with `prediction` by row
    position, so `prediction` must be in the same order as that file. Values
    in the Target column are translated through the module-level `rev_map`
    before writing.

    Parameters
    ----------
    prediction : array-like
        Predicted class codes, one per row of "test.csv".
    model_name : object
        Converted with str() and appended to the output file name.

    Writes "ps_s4_e6_<model_name>.csv" with `id` as the index column.
    """
    test_ids = pd.read_csv("test.csv")['id']
    target_col = pd.DataFrame(prediction,columns=["Target"])
    submission = (
        pd.concat([target_col,test_ids],axis=1)
        .set_index("id")
        .replace(rev_map)
    )
    submission.to_csv("ps_s4_e6_"+str(model_name)+".csv")

### Fitting models

In [None]:
# NOTE(review): `cat_clf` is not defined anywhere in the visible notebook -
# presumably a fitted classifier from a removed "Fitting models" cell. As it
# stands this cell raises NameError on Restart & Run All. TODO: restore the fit.
# Pairs each feature importance with its column name (train_df minus 'Target').
feature_df = pd.DataFrame(cat_clf.feature_importances_,index=train_df.drop('Target',axis=1).columns,columns=['importance'])

In [None]:
# Bar chart of the feature importances, largest first.
feature_df.sort_values(by='importance',ascending=False).plot(kind='bar')

In [None]:
# Features excluded from the second feature set; the same list is removed
# from the training and the test frame so both keep identical columns.
dropped_features = [
    'Debtor',
    'Application order',
    'Previous qualification',
    'Curricular units 1st sem (credited)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (without evaluations)',
    'Curricular units 1st sem (without evaluations)',
    'Daytime/evening attendance',
    'Marital status',
    'Nacionality',
    'International',
    'Educational special needs',
]
X_train_2 = train_df.drop(['Target'] + dropped_features,axis=1)
y_train_2 = train_df['Target']
X_test_2 = test_df.drop(dropped_features,axis=1)

### Neural Network

In [None]:
from sklearn.preprocessing import StandardScaler
# A fresh scaler for the reduced feature set: statistics come from the
# training rows only and are then reused for the test rows.
st_sclr = StandardScaler().fit(X_train_2)
X_train_2 = st_sclr.transform(X_train_2)
X_test_2 = st_sclr.transform(X_test_2)

In [None]:
# Imported from tensorflow.keras to match the model/layer/callback imports used
# for the network below, rather than mixing in the standalone `keras` package.
from tensorflow.keras.utils import to_categorical
X_train_NN = X_train_2
# One-hot encode the integer class codes, as required by the
# categorical_crossentropy loss the network is compiled with.
y_train_NN = to_categorical(y_train_2)

In [None]:
# Sanity check: show which container type to_categorical returned.
type(y_train_NN)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation / early stopping.
# random_state is fixed so the split (and therefore the reported validation
# metrics) is reproducible across kernel restarts.
X_train_NN,X_val_NN,y_train_NN,y_val_NN = train_test_split(X_train_NN,y_train_NN,test_size=0.2,random_state=42)

### Creating model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Defining EarlyStopping: stop once val_loss has not improved for 25 epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the predictions would come from weights up to 25 epochs past the best one.
early_stop = EarlyStopping(monitor='val_loss',mode='min',patience=25,restore_best_weights=True)
# Model: fully connected network 24 -> 12 -> 6 -> 3 with dropout after each
# hidden layer.
model = Sequential()
# layer 1 -> 24 units
model.add(Dense(24,activation='relu'))
model.add(Dropout(0.15))
# layer 2 -> 12 units
model.add(Dense(12,activation='relu'))
model.add(Dropout(0.15))
# layer 3 -> 6 units
model.add(Dense(6,activation='relu'))
model.add(Dropout(0.15))
# layer 4 -> output: one softmax unit per target class
model.add(Dense(3,activation='softmax'))

# Compiling model: categorical cross-entropy matches the one-hot encoded labels.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics = ['accuracy'])

#### Training model

In [None]:
# Train for at most 250 epochs; early stopping on the validation split ends
# training sooner. `callbacks` is documented as a list of callbacks, so the
# single callback is wrapped in one.
model.fit(x = X_train_NN,
          y = y_train_NN,
          validation_data = (X_val_NN,y_val_NN),
          callbacks=[early_stop],
          epochs = 250,
          batch_size=512)

In [None]:
# Model outputs for the scaled, reduced test feature set (one row per test sample).
predictions = model.predict(X_test_2)

In [None]:
# Predicted class = index of the largest output in each row.
pred_NN = np.argmax(predictions,axis=1)

In [None]:
# Write the network's predictions to "ps_s4_e6_NN2.csv" via the submission helper.
pred_to_csv(pred_NN,"NN2")