In [1]:
#Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import statsmodels.api as sm
from collections import Counter

  from pandas.core import datetools


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline

In [4]:
sns.set(color_codes= True, palette= 'muted')

pd.options.display.max_columns= 1000
pd.options.display.max_rows= 1000

In [5]:
#data
train= pd.read_csv('train_dataset.csv')
test= pd.read_csv('test_dataset.csv')
sample= pd.read_csv('sample_submission.csv')

In [6]:
train.head()

Unnamed: 0,ID,Essayset,min_score,max_score,score_1,score_2,score_3,score_4,score_5,clarity,coherent,EssayText
0,1,1.0,0,3,1,1,1.0,1.0,1.0,average,worst,Some additional information that we would need...
1,2,1.0,0,3,1,1,,1.5,1.0,excellent,worst,"After reading the expirement, I realized that ..."
2,3,1.0,0,3,1,1,1.0,1.0,1.5,worst,above_average,"What you need is more trials, a control set up..."
3,4,1.0,0,3,0,0,0.0,0.0,1.0,worst,worst,The student should list what rock is better an...
4,5,1.0,0,3,2,2,2.0,2.5,1.0,above_average,worst,For the students to be able to make a replicat...


In [7]:
test.head()

Unnamed: 0,ID,Essayset,min_score,max_score,clarity,coherent,EssayText
0,1673,1,0,3,average,worst,The procedures I think they should have includ...
1,1674,1,0,3,average,worst,"In order to replicate this experiment, you wou..."
2,1675,1,0,3,above_average,above_average,"In order to replicate their experiment, you wo..."
3,1676,1,0,3,worst,worst,Pleace a simple of one material into one conta...
4,1677,1,0,3,worst,worst,Determin the mass of four different samples ma...


In [8]:
train.shape, test.shape, sample.shape

((17043, 12), (5224, 7), (99, 3))

In [9]:
train.isnull().sum().sort_values(ascending= False)

Essayset     157
score_3      147
coherent     145
score_5      144
clarity      138
score_4      136
EssayText      0
score_2        0
score_1        0
max_score      0
min_score      0
ID             0
dtype: int64

In [10]:
#Here, I'm going to drop null values
train_= train.copy()
test_= test.copy()
train= train.dropna()

In [11]:
train.head()

Unnamed: 0,ID,Essayset,min_score,max_score,score_1,score_2,score_3,score_4,score_5,clarity,coherent,EssayText
0,1,1.0,0,3,1,1,1.0,1.0,1.0,average,worst,Some additional information that we would need...
2,3,1.0,0,3,1,1,1.0,1.0,1.5,worst,above_average,"What you need is more trials, a control set up..."
3,4,1.0,0,3,0,0,0.0,0.0,1.0,worst,worst,The student should list what rock is better an...
4,5,1.0,0,3,2,2,2.0,2.5,1.0,above_average,worst,For the students to be able to make a replicat...
5,6,1.0,0,3,1,0,0.0,0.0,0.0,worst,worst,I would need the information of why you would ...


In [12]:
train['Essayset']= train['Essayset'].astype(int)

In [13]:
train['essay_score']= np.mean(train[['score_1', 'score_2', 'score_3', 'score_4', 'score_5']], axis= 1) 

In [14]:
train['essay_score']= np.round(train['essay_score']).astype(int)

In [15]:
Counter(train['essay_score'])

Counter({0: 6326, 1: 5409, 2: 3757, 3: 684})

In [16]:
#Now its become a multiclass classification problem
#Lets see what happened next
target= train['essay_score']

train= train.drop(['ID', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'essay_score'], axis= 1)

In [17]:
test.drop(['ID', 'min_score', 'max_score'], axis= 1, inplace= True)

In [18]:
train.drop(['min_score', 'max_score'], axis= 1, inplace= True)

In [19]:
train.shape, test.shape

((16176, 4), (5224, 4))

In [20]:
#Now I'm going to do text vectorization for the modeling purpose
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences, sequence

Using TensorFlow backend.


In [21]:
tokenizer= Tokenizer(num_words= 2000)

In [22]:
tokenizer.fit_on_texts(train['EssayText'].values)

In [23]:
train_et_vect= tokenizer.texts_to_sequences(train['EssayText'].values)

In [24]:
test_et_vect= tokenizer.texts_to_sequences(test['EssayText'].values)

In [25]:
train_et_vect= pad_sequences(train_et_vect, maxlen= 100)
test_et_vect= pad_sequences(test_et_vect, maxlen= 100)

In [26]:
type(train_et_vect)

numpy.ndarray

In [27]:
import re

def preprocessing_text_length(tweets):
    num_words= []
    num_char= []
    avg_word_len= []
    num_stopwords= []
    num_special_char= []
    num_upper_cases= []
    num_numerics= []

    for tweet in tweets:
        #remove links 
        #tweet= re.sub(r'(http|https|ftp)://[a-zA-Z0-9\./]+', '', tweet, flags= re.I)
        #tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        
        tweet= tweet.strip().split()
        
        #num of words
        words= [w for w in tweet]
        num_words.append(len(words))
        
        #num_char
        chars= len(tweet)
        num_char.append(chars)
        
        #num_avg word length
        words_avg_len= [sum(len(w) for w in tweet)/len(words)]
        avg_word_len.append(words_avg_len[0])
        
        #number of stop words
        from nltk.corpus import stopwords
        stop= stopwords.words('english')
        stopword= [w for w in tweet if w in stop]
        num_stopwords.append(len(stopword))
        
        #number of special character
        hastags= [w for w in tweet if w.startswith('#')]
        num_special_char.append(len(hastags))
        
        #number of numerics
        numerics= [w for w in tweet if w.isdigit()]
        num_numerics.append(len(numerics))
        
        #number of upper cases
        upper_cases= [w for w in tweet if w.isupper()]
        num_upper_cases.append(len(upper_cases))
        
    return num_char, num_numerics, num_special_char, num_stopwords, num_upper_cases, num_words, avg_word_len

In [28]:
df_all= pd.concat([train, test])

In [30]:
df_all.head()

Unnamed: 0,Essayset,clarity,coherent,EssayText
0,1,average,worst,Some additional information that we would need...
2,1,worst,above_average,"What you need is more trials, a control set up..."
3,1,worst,worst,The student should list what rock is better an...
4,1,above_average,worst,For the students to be able to make a replicat...
5,1,worst,worst,I would need the information of why you would ...


In [31]:
num_chars, num_numeric, num_special_chars, num_stopword, num_upper_case, num_word, avg_words_len= preprocessing_text_length(df_all['EssayText'])

df_all['num_chars']= num_chars
df_all['num_numeric']= num_numeric
df_all['num_special_chars']= num_special_chars
df_all['num_stopword']= num_stopword
df_all['num_upper_case']= num_upper_case
df_all['num_word']= num_word
df_all['avg_words_len']= avg_words_len

In [32]:
_train= df_all[:len(train)]
_test= df_all[len(train):]

In [33]:
_train= _train.drop(['EssayText'], axis= 1)
_test= _test.drop(['EssayText'], axis= 1)

In [34]:
from scipy.sparse import csr_matrix

In [35]:
from sklearn.preprocessing import LabelEncoder

In [36]:
_train.head()

Unnamed: 0,Essayset,clarity,coherent,num_chars,num_numeric,num_special_chars,num_stopword,num_upper_case,num_word,avg_words_len
0,1,average,worst,56,0,0,28,0,56,4.625
2,1,worst,above_average,35,2,0,13,0,35,3.857143
3,1,worst,worst,17,0,0,9,0,17,4.235294
4,1,above_average,worst,31,0,0,15,0,31,4.129032
5,1,worst,worst,24,0,0,11,1,24,4.416667


In [37]:
for col in ['clarity', 'coherent']:
    encoder= LabelEncoder().fit(_train[col])
    _train[col]= encoder.transform(_train[col].astype(str))
    _test[col]= encoder.transform(_test[col].astype(str))

In [38]:
_train= _train.values
_test= _test.values

In [39]:
type(_train)

numpy.ndarray

In [40]:
_train.shape, train_et_vect.shape, _test.shape, test_et_vect.shape

((16176, 10), (16176, 100), (5224, 10), (5224, 100))

In [41]:
_train= np.hstack((_train, train_et_vect))
_test= np.hstack((_test, test_et_vect))

In [42]:
_train.shape, _test.shape

((16176, 110), (5224, 110))

In [43]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [44]:
from sklearn.metrics import accuracy_score, f1_score

In [45]:
clf_rf= RandomForestClassifier(random_state= 2019)

In [46]:
clf_rf.fit(_train, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=2019, verbose=0,
            warm_start=False)

In [47]:
prediction= clf_rf.predict(_train)

In [48]:
accuracy_score(target, prediction)

0.9889342235410484

In [49]:
prediction_1= clf_rf.predict(_test)

In [50]:
sample.head()

Unnamed: 0,id,essay_set,essay_score
0,1673,1,
1,1674,1,
2,1675,1,
3,1676,1,
4,1677,1,


In [51]:
sample_1= pd.DataFrame()

In [52]:
sample_1['id']= test_['ID']
sample_1['essay_set']= test_['Essayset']
sample_1['essay_score']= prediction_1

In [53]:
sample_1.head()

Unnamed: 0,id,essay_set,essay_score
0,1673,1,0
1,1674,1,1
2,1675,1,2
3,1676,1,0
4,1677,1,1


In [47]:
sample_1.to_csv('sample_6.csv', index= False)

In [54]:
import lightgbm as lgbm

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
def run_LGBM(train, target, test):
    np.random.seed(2019)
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, test_size= 0.20, random_state= 2019)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    dtrain= lgbm.Dataset(X_train, y_train, silent= False)
    dtest= lgbm.Dataset(X_test, y_test, silent= False)
    
    param_lgbm= {}
    param_lgbm['learning_rate']= 0.1
    param_lgbm['num_leaves']= 21
    param_lgbm['objective']= 'multiclass'
    param_lgbm['num_iterations']= 10000
    param_lgbm['seed']= 2019
    param_lgbm['num_class']= 7
    #param_lgbm['metric']= 'auc'
    #param_lgbm['is_unbalance']= True
    
    model= lgbm.train(param_lgbm, dtrain, valid_sets= (dtrain, dtest), valid_names= ('train', 'valid'), 
                      verbose_eval= 100, early_stopping_rounds= 50)
    
    prediction= model.predict(X_test)
    
    #prediction_= np.round(prediction).astype(int)
    
    #fpr, tpr, _ = roc_curve(y_test, prediction)
    
    #auc_= auc(fpr, tpr)
    #prediction= np.argsort(prediction)
    #prediction= prediction.flatten()
    prediction= [np.argmax(line) for line in prediction]
        
    
    print('Accuracy Score: {:.4f}' .format(accuracy_score(y_test, prediction)))
    #print('ROC AUC Score: {:.4f}' .format(roc_auc_score(y_test, prediction_)))
    print('f1 Score: {:.4f}' .format(f1_score(y_test, prediction, average= 'weighted')))
    #print('The ROC CURVE (AUC): {:.4f}' .format(auc_))
    
    return model.predict(test)

In [57]:
pred_lgbm_1= run_LGBM(_train, target, _test)

(12940, 110) (3236, 110) (12940,) (3236,)
Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 0.355976	valid's multi_logloss: 0.513837
Early stopping, best iteration is:
[142]	train's multi_logloss: 0.305385	valid's multi_logloss: 0.512594
Accuracy Score: 0.7506
f1 Score: 0.7462


In [58]:
pred_lgbm_1= [np.argmax(line) for line in pred_lgbm_1]

In [59]:
Counter(pred_lgbm_1)

Counter({0: 2297, 1: 1267, 2: 1485, 3: 175})

In [60]:
sample_1['essay_score']= pred_lgbm_1

In [55]:
sample_1.to_csv('sample_7.csv', index= False)

In [58]:
from sklearn.model_selection import GroupKFold, KFold

In [62]:
def run_LGBM(X_train, X_test, y_train, y_test):
    np.random.seed(2019)
    
    #X_train, X_test, y_train, y_test= train_test_split(train, target, test_size= 0.30, random_state= 2019)
    #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    dtrain= lgbm.Dataset(X_train, y_train)
    dtest= lgbm.Dataset(X_test, y_test)
    
    param_lgbm= {}
    param_lgbm['learning_rate']= 0.03
    #param_lgbm['num_leaves']= 21
    param_lgbm['objective']= 'multiclass'
    param_lgbm['num_iterations']= 10000
    param_lgbm['seed']= 2019
    param_lgbm['num_class']= 4
    #param_lgbm['metric']= 'auc'
    #param_lgbm['is_unbalance']= True
    
    model= lgbm.train(param_lgbm, dtrain, valid_sets= (dtrain, dtest), valid_names= ('train', 'valid'), 
                      verbose_eval= 100, early_stopping_rounds= 50)
    
    prediction= model.predict(X_test)
    
    #prediction_= np.round(prediction).astype(int)
    
    #fpr, tpr, _ = roc_curve(y_test, prediction)
    
    #auc_= auc(fpr, tpr)
    #prediction= np.argsort(prediction)
    #prediction= prediction.flatten()
    prediction= [np.argmax(line) for line in prediction]
        
    
    print('Accuracy Score: {:.4f}' .format(accuracy_score(y_test, prediction)))
    #print('ROC AUC Score: {:.4f}' .format(roc_auc_score(y_test, prediction_)))
    print('f1 Score: {:.4f}' .format(f1_score(y_test, prediction, average= 'weighted')))
    #print('The ROC CURVE (AUC): {:.4f}' .format(auc_))
    
    return model, prediction

In [63]:
pred_train= np.zeros(len(target), dtype= int)

In [64]:
pred_test= []

In [65]:
fold= GroupKFold(n_splits= 5)

In [66]:
target_= target.copy()
target= target.values

In [67]:
for i, (train_idx, test_idx) in enumerate(fold.split(_train, target, train['Essayset'])):
    print('Fold: {}' .format(i+1))
    
    X_train, X_test= _train[train_idx], _train[test_idx]
    y_train, y_test= target[train_idx], target[test_idx]
    
    model, _pred_train= run_LGBM(X_train, X_test, y_train, y_test)
    
    _pred_test= model.predict(_test)
    
    pred_train[test_idx]= _pred_train
    
    _pred_test= [np.argmax(line) for line in _pred_test]
    pred_test.append(_pred_test)
    
    del model, _pred_train, _pred_test
    
print('Accuracy: {}' .format(accuracy_score(target, pred_train)))

Fold: 1
Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 0.485261	valid's multi_logloss: 0.713873
[200]	train's multi_logloss: 0.35946	valid's multi_logloss: 0.664861
Early stopping, best iteration is:
[249]	train's multi_logloss: 0.325721	valid's multi_logloss: 0.661836
Accuracy Score: 0.6510
f1 Score: 0.6185
Fold: 2
Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 0.494109	valid's multi_logloss: 0.710071
[200]	train's multi_logloss: 0.369293	valid's multi_logloss: 0.655519
[300]	train's multi_logloss: 0.305886	valid's multi_logloss: 0.648262
Early stopping, best iteration is:
[290]	train's multi_logloss: 0.310992	valid's multi_logloss: 0.647911
Accuracy Score: 0.6771
f1 Score: 0.6878
Fold: 3
Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 0.495775	valid's multi_logloss: 0.683247
[200]	train's multi_logloss: 0.36474	valid's multi_logloss: 0.635057
[300]	train'

In [68]:
pred_test_1= np.round(np.mean(pred_test, axis= 0)).astype(int)

In [69]:
sample_1['essay_score']= pred_test_1

In [70]:
sample_1.to_csv('sample_10.csv', index= False)

In [59]:
import catboost as cb

In [60]:
def run_CB(train, target, test):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, test_size= 0.25, random_state= 2019)
    
    model= cb.CatBoostClassifier(random_seed= 2019)
    
    model.fit(X_train, y_train)
    
    prediction= model.predict(X_test)
    
    print('Accuracy Score: {:.4f}' .format(accuracy_score(y_test, prediction)))
    #print('ROC AUC Score: {:.4f}' .format(roc_auc_score(y_test, prediction_)))
    print('f1 Score: {:.4f}' .format(f1_score(y_test, prediction, average= 'weighted')))
    #print('The ROC CURVE (AUC): {:.4f}' .format(auc_))
    
    return model.predict(test)

In [70]:
pred_cb_1= run_CB(_train, target, _test)

Learning rate set to 0.033953
0:	learn: 0.6530225	total: 232ms	remaining: 3m 51s
1:	learn: 0.6183651	total: 320ms	remaining: 2m 39s
2:	learn: 0.5870094	total: 418ms	remaining: 2m 19s
3:	learn: 0.5591670	total: 522ms	remaining: 2m 10s
4:	learn: 0.5366018	total: 611ms	remaining: 2m 1s
5:	learn: 0.5165120	total: 705ms	remaining: 1m 56s
6:	learn: 0.4988224	total: 809ms	remaining: 1m 54s
7:	learn: 0.4843510	total: 903ms	remaining: 1m 52s
8:	learn: 0.4711888	total: 1.01s	remaining: 1m 51s
9:	learn: 0.4596784	total: 1.11s	remaining: 1m 50s
10:	learn: 0.4489654	total: 1.2s	remaining: 1m 47s
11:	learn: 0.4402669	total: 1.3s	remaining: 1m 47s
12:	learn: 0.4330831	total: 1.4s	remaining: 1m 46s
13:	learn: 0.4254461	total: 1.49s	remaining: 1m 44s
14:	learn: 0.4177810	total: 1.59s	remaining: 1m 44s
15:	learn: 0.4106093	total: 1.69s	remaining: 1m 43s
16:	learn: 0.4053379	total: 1.78s	remaining: 1m 42s
17:	learn: 0.4015597	total: 1.9s	remaining: 1m 43s
18:	learn: 0.3970774	total: 2s	remaining: 1m 43s


In [71]:
Counter(pred_cb_1)

Counter({0.0: 2257, 1.0: 2967})

In [140]:
pred_cb_1= pred_cb_1.astype(int)

In [141]:
sample_1['essay_score']= pred_cb_1

In [142]:
sample_1.to_csv('sample_4.csv', index= False)

In [61]:
def run_ML(model, train, target, test):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, test_size= 0.25, random_state= 2019)
    
    #model= cb.CatBoostClassifier(random_seed= 2019)
    
    model.fit(X_train, y_train)
    
    prediction= model.predict(X_test)
    
    print('Accuracy Score: {:.4f}' .format(accuracy_score(y_test, prediction)))
    #print('ROC AUC Score: {:.4f}' .format(roc_auc_score(y_test, prediction_)))
    print('f1 Score: {:.4f}' .format(f1_score(y_test, prediction, average= 'weighted')))
    #print('The ROC CURVE (AUC): {:.4f}' .format(auc_))
    
    return model.predict(test)

In [62]:
import xgboost as xgb

In [64]:
clf_xgb= xgb.XGBClassifier(random_state= 2019)

In [75]:
pred_xgb_1= run_ML(clf_xgb, _train, target, _test)

Accuracy Score: 0.7391
f1 Score: 0.7289


  if diff:
  if diff:


In [76]:
Counter(pred_xgb_1)

Counter({0: 2247, 1: 1311, 2: 1606, 3: 60})

In [77]:
sample_1['essay_score']= pred_xgb_1

In [78]:
sample_1.to_csv('sample_11.csv', index= False)

In [63]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC

In [80]:
clf_mnb= MultinomialNB()

In [81]:
pred_mnb_1= run_ML(clf_mnb, _train, target, _test)

Accuracy Score: 0.4500
f1 Score: 0.4216


In [82]:
clf_svc= SVC(random_state= 2019, C= 0.1)

In [83]:
pred_svc_1= run_ML(clf_svc, _train, target, _test)

Accuracy Score: 0.3934
f1 Score: 0.2222


In [80]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier

In [81]:
clf_xgb_bag= BaggingClassifier(clf_xgb, random_state= 2019)

In [83]:
pred_xgb_bag_1= run_ML(clf_xgb_bag, _train, target, _test)

Accuracy Score: 0.7374
f1 Score: 0.7282


In [65]:
_train1= df_all[:len(train)]
_test1= df_all[len(train):]

In [66]:
for col in ['clarity', 'coherent']:
    encoder= LabelEncoder().fit(_train1[col])
    _train1[col]= encoder.transform(_train1[col].astype(str))
    _test1[col]= encoder.transform(_test1[col].astype(str))

In [67]:
_train1.head()

Unnamed: 0,Essayset,clarity,coherent,EssayText,num_chars,num_numeric,num_special_chars,num_stopword,num_upper_case,num_word,avg_words_len
0,1,1,3,Some additional information that we would need...,56,0,0,28,0,56,4.625
2,1,3,0,"What you need is more trials, a control set up...",35,2,0,13,0,35,3.857143
3,1,3,3,The student should list what rock is better an...,17,0,0,9,0,17,4.235294
4,1,0,3,For the students to be able to make a replicat...,31,0,0,15,0,31,4.129032
5,1,3,3,I would need the information of why you would ...,24,0,0,11,1,24,4.416667


In [68]:
col_to_add= _train1.columns.tolist()

In [69]:
col_to_add.remove('EssayText')

In [70]:
_train1= _train1.rename(columns= {'EssayText': 'tweets'})
_test1= _test1.rename(columns= {'EssayText': 'tweets'})

In [71]:
from scipy.sparse import hstack

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [73]:
from sklearn.metrics import confusion_matrix, classification_report

In [74]:
def ml_modeling(model, train, target, test, countVectorizer= True, tfidfVectorizer= False, col_to_add= col_to_add):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, random_state= 2019)
    
    if countVectorizer:
        vect= CountVectorizer().fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction, average= 'weighted')
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    if tfidfVectorizer:
        vect= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction, average= 'weighted')
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    return model.predict(test_vect)

In [109]:
pred_count_xgb_1= ml_modeling(clf_xgb, _train1, target, _test1)

X_train_vect (12132, 13140)
X_test_vect (4044, 13140)
test_vect (5224, 13140)


  if diff:
  if diff:


Training accuracy: 0.7905539070227497
Testing accuracy: 0.7700296735905044
f1 score: 0.7639548854643
Classification Report: 
             precision    recall  f1-score   support

          0       0.81      0.84      0.82      1591
          1       0.71      0.72      0.72      1355
          2       0.80      0.83      0.82       934
          3       0.59      0.23      0.33       164

avg / total       0.76      0.77      0.76      4044



  if diff:


In [110]:
pred_tfidf_xgb_1= ml_modeling(clf_xgb, _train1, target, _test1, False, True)

X_train_vect (12132, 28378)
X_test_vect (4044, 28378)
test_vect (5224, 28378)


  if diff:
  if diff:


Training accuracy: 0.8028354764259809
Testing accuracy: 0.7764589515331355
f1 score: 0.7712290475302479
Classification Report: 
             precision    recall  f1-score   support

          0       0.81      0.84      0.83      1591
          1       0.72      0.72      0.72      1355
          2       0.81      0.83      0.82       934
          3       0.64      0.27      0.38       164

avg / total       0.77      0.78      0.77      4044



  if diff:


In [111]:
Counter(pred_tfidf_xgb_1)

Counter({0: 2317, 1: 1236, 2: 1575, 3: 96})

In [114]:
sample_1['essay_score']= pred_tfidf_xgb_1

In [115]:
sample_1.to_csv('sample_12.csv', index= False)

In [116]:
sample_1['essay_score']= pred_count_xgb_1

In [117]:
sample_1.to_csv('sample_13.csv', index= False)

In [77]:
clf_lgbm= lgbm.LGBMClassifier(random_state= 2019)

In [120]:
pred_count_lgbm_1= ml_modeling(clf_lgbm, _train1, target, _test1)

X_train_vect (12132, 13140)
X_test_vect (4044, 13140)
test_vect (5224, 13140)


  if diff:
  if diff:


Training accuracy: 0.9096604022420046
Testing accuracy: 0.8039070227497527
f1 score: 0.8019038972517816
Classification Report: 
             precision    recall  f1-score   support

          0       0.84      0.87      0.85      1591
          1       0.77      0.75      0.76      1355
          2       0.83      0.84      0.83       934
          3       0.58      0.45      0.50       164

avg / total       0.80      0.80      0.80      4044



  if diff:


In [121]:
pred_tfidf_lgbm_1= ml_modeling(clf_lgbm, _train1, target, _test1, False, True)

X_train_vect (12132, 28378)
X_test_vect (4044, 28378)
test_vect (5224, 28378)


  if diff:
  if diff:


Training accuracy: 0.9413946587537092
Testing accuracy: 0.8009396636993076
f1 score: 0.7981759006688851
Classification Report: 
             precision    recall  f1-score   support

          0       0.83      0.88      0.85      1591
          1       0.77      0.74      0.76      1355
          2       0.81      0.83      0.82       934
          3       0.56      0.41      0.47       164

avg / total       0.80      0.80      0.80      4044



  if diff:


In [122]:
Counter(pred_count_lgbm_1), Counter(pred_tfidf_lgbm_1)

(Counter({0: 2254, 1: 1302, 2: 1448, 3: 220}),
 Counter({0: 2268, 1: 1294, 2: 1463, 3: 199}))

In [123]:
sample_1['essay_score']= pred_count_lgbm_1
sample_1.to_csv('sample_14.csv', index= False)

In [124]:
sample_1['essay_score']= pred_tfidf_lgbm_1
sample_1.to_csv('sample_15.csv', index= False)

In [82]:
def ml_modeling(model, train, target, test, countVectorizer= True, tfidfVectorizer= False, col_to_add= col_to_add):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, random_state= 2019)
    
    if countVectorizer:
        vect= CountVectorizer(ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction, average= 'weighted')
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    if tfidfVectorizer:
        vect_word= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect_word.transform(X_train['tweets'])
        X_test_vect= vect_word.transform(X_test['tweets'])
        test_vect= vect_word.transform(test['tweets'])
        
        vect_char= TfidfVectorizer(min_df= 5, analyzer= 'char', ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_char= vect_char.transform(X_train['tweets'])
        X_test_char= vect_char.transform(X_test['tweets'])
        test_char= vect_char.transform(test['tweets'])
        
        X_train_vect= hstack((X_train_vect, X_train_char))
        X_test_vect= hstack((X_test_vect, X_test_char))
        test_vect= hstack((test_vect, test_char))
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction, average= 'weighted')
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    return model.predict(test_vect), X_train_vect, X_test_vect, test_vect

In [83]:
pred_tfidf_lgbm_2, train_vect, valid_vect, test_vect= ml_modeling(clf_lgbm, _train1, target, _test1, False, True)

X_train_vect (12132, 34889)
X_test_vect (4044, 34889)
test_vect (5224, 34889)


  if diff:
  if diff:


Training accuracy: 0.976178700956149
Testing accuracy: 0.8118199802176064
f1 score: 0.809761500545742
Classification Report: 
             precision    recall  f1-score   support

          0       0.85      0.88      0.86      1591
          1       0.78      0.76      0.77      1355
          2       0.82      0.84      0.83       934
          3       0.58      0.45      0.50       164

avg / total       0.81      0.81      0.81      4044



  if diff:


In [84]:
sample_1['essay_score']= pred_tfidf_lgbm_2

In [85]:
sample_1.to_csv('sample_17.csv', index= False)

In [86]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU, Activation
from keras.losses import binary_crossentropy, categorical_crossentropy, sparse_categorical_crossentropy
from keras.optimizers import SGD, Adam

In [90]:
#Created a function that vectorize the nlp data
def vectorize_data(train, test, col_to_add= col_to_add):
    #only using tfidf vectorizer
    vect_word= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(train['tweets'])
    train_vect= vect_word.transform(train['tweets'])
    #test_vect= vect_word.transform(X_test['tweets'])
    test_vect= vect_word.transform(test['tweets'])

    vect_char= TfidfVectorizer(min_df= 5, analyzer= 'char', ngram_range= (1, 3)).fit(train['tweets'])
    train_char= vect_char.transform(train['tweets'])
    #X_test_char= vect_char.transform(X_test['tweets'])
    test_char= vect_char.transform(test['tweets'])

    train_vect= hstack((train_vect, train_char))
    #X_test_vect= hstack((X_test_vect, X_test_char))
    test_vect= hstack((test_vect, test_char))
    
    #now add the columns to the sparse matrix using the scipy library
    for col in col_to_add:
        train_vect= hstack((train_vect, np.array(train[col])[:, None]))
        #X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
        test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
    return train_vect, test_vect

In [91]:
_train_vect, _test_vect= vectorize_data(_train1, _test1)

In [92]:
_train_vect.shape, _test_vect.shape, type(_train_vect), type(_test_vect)

((16176, 43667),
 (5224, 43667),
 scipy.sparse.coo.coo_matrix,
 scipy.sparse.coo.coo_matrix)

In [93]:
_train_vect= csr_matrix(_train_vect)
#valid_vect= csr_matrix(valid_vect)
_test_vect= csr_matrix(_test_vect)

In [94]:
type(_train_vect), type(_test_vect), _train_vect.shape, _test_vect.shape

(scipy.sparse.csr.csr_matrix,
 scipy.sparse.csr.csr_matrix,
 (16176, 43667),
 (5224, 43667))

In [95]:
#now using this data in NN
X_train, X_test, y_train, y_test= train_test_split(_train_vect, target, test_size= 0.25, random_state= 2019)

In [96]:
input_shape= X_train.shape[1]

In [101]:
model= Sequential()
model.add(Dense(units= 100, activation= 'relu', input_dim= input_shape, kernel_initializer= 'uniform'))
model.add(Dense(units= 20, activation= 'relu'))
#model.add(Dense(units= 20, activation= 'relu'))
model.add(Dense(units= 4, activation= 'softmax'))

#sgd= SGD(lr= 0.1)

model.compile(optimizer= 'adam', loss= 'sparse_categorical_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train, y= y_train, batch_size= 128, epochs= 15, shuffle= False,
          validation_data= (X_test, y_test))

y_pred= model.predict_classes(X_test, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train, y_train, batch_size= 128)
test_acc= accuracy_score(y_test, y_pred)
f1= f1_score(y_test, y_pred, average= 'weighted')

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real= model.predict_classes(_test_vect)

Train on 12132 samples, validate on 4044 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Detail of training losses and accuracies:  [0.029930579001952957, 0.9953016815034619]
Testing accuracy: 0.7391196834817013
f1 score: 0.7288480121385342


In [102]:
Counter(y_pred_real)

Counter({0: 2357, 1: 1050, 2: 1595, 3: 222})

In [103]:
sample_1['essay_score']= y_pred_real

In [104]:
sample_1.to_csv('sample_18.csv', index= False)

In [105]:
def run_LGBM(X_train, X_test, y_train, y_test):
    np.random.seed(2019)
    
    #X_train, X_test, y_train, y_test= train_test_split(train, target, test_size= 0.30, random_state= 2019)
    #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    dtrain= lgbm.Dataset(X_train, y_train)
    dtest= lgbm.Dataset(X_test, y_test)
    
    param_lgbm= {}
    param_lgbm['learning_rate']= 0.03
    #param_lgbm['num_leaves']= 21
    param_lgbm['objective']= 'multiclass'
    param_lgbm['num_iterations']= 10000
    param_lgbm['seed']= 2019
    param_lgbm['num_class']= 4
    #param_lgbm['metric']= 'auc'
    #param_lgbm['is_unbalance']= True
    
    model= lgbm.train(param_lgbm, dtrain, valid_sets= (dtrain, dtest), valid_names= ('train', 'valid'), 
                      verbose_eval= 100, early_stopping_rounds= 50)
    
    prediction= model.predict(X_test)
    
    #prediction_= np.round(prediction).astype(int)
    
    #fpr, tpr, _ = roc_curve(y_test, prediction)
    
    #auc_= auc(fpr, tpr)
    #prediction= np.argsort(prediction)
    #prediction= prediction.flatten()
    prediction= [np.argmax(line) for line in prediction]
        
    
    print('Accuracy Score: {:.4f}' .format(accuracy_score(y_test, prediction)))
    #print('ROC AUC Score: {:.4f}' .format(roc_auc_score(y_test, prediction_)))
    print('f1 Score: {:.4f}' .format(f1_score(y_test, prediction, average= 'weighted')))
    #print('The ROC CURVE (AUC): {:.4f}' .format(auc_))
    
    return model

In [106]:
model_lgbm_1= run_LGBM(X_train, X_test, y_train, y_test)

Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 0.431488	valid's multi_logloss: 0.529365
[200]	train's multi_logloss: 0.271851	valid's multi_logloss: 0.446231
[300]	train's multi_logloss: 0.196209	valid's multi_logloss: 0.424277
[400]	train's multi_logloss: 0.148319	valid's multi_logloss: 0.417715
[500]	train's multi_logloss: 0.114714	valid's multi_logloss: 0.415945
Early stopping, best iteration is:
[506]	train's multi_logloss: 0.113005	valid's multi_logloss: 0.41578
Accuracy Score: 0.8140
f1 Score: 0.8116


In [107]:
pred_tfidf_lgbm_3= model_lgbm_1.predict(_test_vect)

In [108]:
pred_tfidf_lgbm_3= [np.argmax(line) for line in pred_tfidf_lgbm_3]

In [109]:
Counter(pred_tfidf_lgbm_3)

Counter({0: 2266, 1: 1304, 2: 1467, 3: 187})

In [110]:
sample_1['essay_score']= pred_tfidf_lgbm_3

In [111]:
sample_1.to_csv('sample_19.csv', index= False)