In [12]:
# Import libraries/frameworks
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras

In [13]:
df = pd.read_excel( 'train_final.xlsx' )
test_data = pd.read_excel( 'test_final.xlsx' )
test_target = pd.read_excel( 'gold_final.xlsx' )
test_data = pd.merge( test_data, test_target, on = 'ItemNum' ) 
test_data.fillna( 0, inplace=True) 

df.fillna(0, inplace=True)
df.head(2)

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time
0,91,"Over 1 year, a study is conducted to assess th...",Case-control study,Crossover study,Open-labeled clinical trial,Randomized clinical trial,"Single-blind, randomized, controlled trial",0,0,0,0,0,C,Open-labeled clinical trial,Text,STEP 1,0.86,111.21
1,288,A previously healthy 52-year-old woman comes t...,Calcitriol production by activated macrophages,Local resorption of bone by metastases,Parathyroid hormone-related peptide secretion,Secretion of parathyroid hormone,Secretion of thyroid-stimulating hormone,0,0,0,0,0,A,Calcitriol production by activated macrophages,Text,STEP 1,0.44,83.94


In [14]:
# https://bergen.edu/ELRC/connectingwords.html#:~:text=ADDITION%3A%20also%2C%20besides%2C%20equally,%2C%20moreover%2C%20next%2C%20too.

connectives = [ 'also', 'besides', 'equally', 'further', 'furthermore', 'in addition', 'moreover', 'next', 'too', 'also', 'likewise', 'moreover', \
'however', 'on the contrary', 'on the other hand', 'in contrast', 'nevertheless', 'for example', 'for instance', 'in fact', \
'finally', 'in brief', 'in conclusion', 'in other words', 'in short', 'in summary', 'therefore', 'accordingly', 'as a result', \
'consequently', 'for this reason', 'therefore', 'afterward', 'in the meantime', 'later', 'meanwhile', 'next', 'second', 'earlier', \
'finally', 'first', 'soon', 'still', 'then', 'third' ]

def vectorizeEXAM(x):
    if x == 'STEP 1' : 
        return .33
    elif x == 'STEP 2' : 
        return .66
    else: 
        return 1.0

In [15]:
def text_feature( data ):
    data = data.astype('str')
    num_of_words = []
    num_of_uniq_words = []
    num_of_additives = []
    num_of_uniq_additives = []
    num_of_normalized_additives = []
    num_of_numbers = []
    num_of_letters = []
    
    for index, row in tqdm(data.iterrows()):
        text = row['ItemStem_Text']
        for option in [ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J' ] : 
            if row[ 'Answer__' + option ] != 0 : 
                text += ' ' + row[ 'Answer__' + option ]
                
        text = text.lower()

        num_of_words.append(len(text.split()))
        num_of_uniq_words.append(len(set(text.split()))) 
        num_of_additives.append(sum([ text.count(i) for i in connectives ]))
        num_of_uniq_additives.append(sum([ 1 if text.count(i) > 0 else 0 for i in connectives ]))
        num_of_normalized_additives.append(sum([ text.count(i) for i in connectives ])/num_of_words[-1])
        num_of_numbers.append(sum(c.isdigit() for c in text))
        num_of_letters.append(sum(c.isalpha() for c in text))

    df['ItemType_num'] = df.ItemType.apply(lambda x: 0 if x == 'PIX' else 1 ) 
    df['EXAM_num'] = df.EXAM.apply( vectorizeEXAM )
    
    featurized = pd.DataFrame( {
                  'num_of_words' : num_of_words,
                  'num_of_uniq_words' : num_of_uniq_words,
                  'num_of_additives' : num_of_additives,
                  'num_of_uniq_additives' : num_of_uniq_additives,
                  'num_of_normalized_additives' : num_of_normalized_additives, 
                  'num_of_numbers' : num_of_numbers,
                  'num_of_letters' : num_of_letters}
                             )
    
    featurized['ItemType_num'] = data.ItemType.apply(lambda x: 0 if x == 'PIX' else 1 ) 
    featurized['EXAM_num'] = data.EXAM.apply( vectorizeEXAM )
    
    return featurized

featurized_data = text_feature( df )
featurized_test_data = text_feature( test_data )

466it [00:00, 2073.08it/s]
201it [00:00, 2262.88it/s]


In [16]:
featurized_data.head(3)

Unnamed: 0,num_of_words,num_of_uniq_words,num_of_additives,num_of_uniq_additives,num_of_normalized_additives,num_of_numbers,num_of_letters,ItemType_num,EXAM_num
0,119,79,0,0,0.0,13,606,1,0.33
1,108,75,2,1,0.018519,26,617,1,0.33
2,124,84,2,2,0.016129,18,561,1,0.66


In [17]:
X_cols = featurized_data.columns.values
y_1 = df['Difficulty']  
y_2 = df['Response_Time'] 

In [18]:
#X_train1, X_test1, y_train1, y_test1 = train_test_split( featurized_data[X_cols ], y_1, test_size=0.1, random_state=1 )
#X_train2, X_test2, y_train2, y_test2 = train_test_split( featurized_data[X_cols ], y_2, test_size=0.1, random_state=1 )

#X_train1, X_val1, y_train1, y_val1 = train_test_split( df[X_cols], y_1, test_size=0.1, random_state=1 )
#X_train2, X_val2, y_train2, y_val2 = train_test_split( df[X_cols], y_2, test_size=0.1, random_state=1 )

X_train, X_test = featurized_data[X_cols ], featurized_test_data[X_cols ]
y_train1, y_train2, y_test1, y_test2 = y_1, y_2, test_data['Difficulty'], test_data['Response_Time']
#X_test1 = test_data[X_cols]
#X_test2 = test_data[X_cols]

### Task 1: Predicting Difficulty

In [8]:
%%time

rfr = RandomForestRegressor(random_state=0)
param_dict = { 'n_estimators': [ 700, 900], "max_depth": [3,5] }
model_rfr = GridSearchCV( rfr ,param_grid=param_dict, cv=5, refit = True)
model_rfr.fit(X_train,y_train1)
print(model_rfr.best_params_)

y_predicted = model_rfr.predict(X_test)
rfr_rmse = mean_squared_error( y_test1, y_predicted, squared=False ) 
print(rfr_rmse)

{'max_depth': 3, 'n_estimators': 700}
0.30395751597905185
CPU times: total: 1min 30s
Wall time: 1min 28s


### Task 2: Predicting Response Time

In [11]:
%%time

rfr = RandomForestRegressor(random_state=0)
param_dict = { 'n_estimators': [800, 900], "max_depth": [3,5,6] }
model_rfr = GridSearchCV( rfr ,param_grid=param_dict, cv=5, refit = True)
model_rfr.fit(X_train,y_train2)
print(model_rfr.best_params_)

y_predicted = model_rfr.predict(X_test)
rfr_rmse = mean_squared_error( y_test2, y_predicted, squared=False ) 
print(rfr_rmse)

{'max_depth': 6, 'n_estimators': 800}
26.234097362989363
CPU times: total: 1min 26s
Wall time: 1min 29s


In [14]:
df.shape

(466, 18)

In [18]:
df.columns

Index(['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
       'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
       'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text', 'ItemType',
       'EXAM', 'Difficulty', 'Response_Time'],
      dtype='object')

In [16]:
sample_test = df.sample(50)

In [20]:
sample_train = df[ ~df['ItemNum'].isin(sample_test['ItemNum'])] 

In [22]:
sample_train.shape

(416, 18)

In [23]:
sample_train.to_csv('sample_train.csv', index=None)
sample_test.to_csv('sample_test.csv', index=None)

In [27]:
df = pd.read_excel( 'train_final.xlsx' )
test = pd.read_excel( 'test_final.xlsx')
featurized_data = text_feature( df )
test_data = text_feature( test ) 

466it [00:00, 1978.45it/s]
201it [00:00, 2252.22it/s]


In [28]:
X_cols = featurized_data.columns.values

In [29]:
X_cols

array(['num_of_words', 'num_of_uniq_words', 'num_of_additives',
       'num_of_uniq_additives', 'num_of_normalized_additives',
       'num_of_numbers', 'num_of_letters', 'ItemType_num', 'EXAM_num'],
      dtype=object)

In [30]:
df.columns

Index(['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
       'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
       'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text', 'ItemType',
       'EXAM', 'Difficulty', 'Response_Time', 'ItemType_num', 'EXAM_num'],
      dtype='object')

In [31]:
df['Response_Time']

0      111.21
1       83.94
2       87.82
3       54.87
4       69.11
        ...  
461     77.18
462    129.56
463     78.03
464     79.69
465     96.63
Name: Response_Time, Length: 466, dtype: float64

In [32]:
featurized_data
rfr = RandomForestRegressor(random_state=0, max_depth= 5, n_estimators= 900)
rfr.fit( featurized_data,  df['Response_Time'] )

In [33]:
predicted_time = rfr.predict( test_data ) 

In [36]:
test['ItemNum']

0      552
1       16
2      441
3      219
4      600
      ... 
196    612
197    315
198    509
199    550
200    290
Name: ItemNum, Length: 201, dtype: int64

In [37]:
ans_df = pd.DataFrame( { 'ItemNum' : test['ItemNum'], 'Prediction' : predicted_time } )

In [38]:
ans_df.to_csv( 'Rishikesh_Fulari_Response_Time_Predictions_run_1.csv', index=None )

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

def find_best_classical_model(X_train,X_test, y_train, y_test):
    
    # Model 1 : Linear Regression 
    model_1_lr = LinearRegression()
    model_1_lr.fit(X_train, y_train)
    #y_predicted_1 = model_1_lr.predict(X_valid)
    #model_1_rmse = mean_squared_error( y_valid, y_predicted_1, squared=False ) 
    predicted_lr = model_1_lr.predict(X_test)
    test_rmse_lr = mean_squared_error( y_test, predicted_lr, squared=False )
    
    # Model 2 : Decision Tree classifier
    dtr = DecisionTreeRegressor(random_state=0)
    param_dict_2 = { "max_depth": [3,5] }
    model_2_dtr = GridSearchCV( dtr ,param_grid=param_dict_2, cv=3, refit = True)
    model_2_dtr.fit(X_train,y_train)
    #y_predicted_2 = model_2_dtr.predict(X_valid)
    #model_2_rmse = mean_squared_error( y_valid, y_predicted_2, squared=False ) 
    predicted_dt = model_2_dtr.predict(X_test)
    test_rmse_dt = mean_squared_error( y_test, predicted_dt, squared=False )

    # Model 3 : KNN
    knn = KNeighborsRegressor(weights="distance", metric= "minkowski", p = 2)
    param_dict_3 = { 'n_neighbors' : [3,5, 7] }
    model_3_knn = GridSearchCV(knn,param_grid=param_dict_3, cv=3 )
    model_3_knn.fit(X_train,y_train)
    #y_predicted_3 = model_3_knn.predict(X_valid)
    #model_3_rmse = mean_squared_error( y_valid, y_predicted_3, squared=False ) 
    predicted_knn = model_3_knn.predict(X_test)
    test_rmse_knn = mean_squared_error( y_test, predicted_knn, squared=False )

    # Model 4 : XGBoost
    regressor=xgb.XGBRegressor(eval_metric='rmse')
    param_dict_4 = {"max_depth": [3, 5], "n_estimators": [600, 700]}
    search = GridSearchCV(regressor, param_dict_4, cv=5).fit(X_train, y_train)
    regressor=xgb.XGBRegressor(n_estimators  = search.best_params_["n_estimators"],
                               max_depth     = search.best_params_["max_depth"],
                               eval_metric='rmse')
    regressor.fit(X_train, y_train)
    #y_predicted_4 = regressor.predict(X_valid)
    #model_4_rmse = mean_squared_error( y_valid, y_predicted_4, squared=False )
    predicted_xgb = regressor.predict(X_test)
    test_rmse_xgb = mean_squared_error( y_test, predicted_xgb, squared=False )

    # Model 5 : GBDT
    gbdt_regressor = GradientBoostingRegressor(random_state=0)
    param_dict_5 =  {"max_depth": [3,5], "n_estimators": [600, 700]}
    model_5_gbdt = GridSearchCV(gbdt_regressor, param_grid=param_dict_5, cv=3, refit=True )
    model_5_gbdt.fit(X_train,y_train)
    #y_predicted_5 = model_5_gbdt.predict(X_valid)
    #model_5_rmse = mean_squared_error( y_valid, y_predicted_5, squared=False )
    predicted_gbdt = model_5_gbdt.predict(X_test)
    test_rmse_dt = mean_squared_error( y_test, predicted_gbdt, squared=False )

    model_names = ["Linear Regression","Decision Trees","KNN","XGBoost","GBDT"]
    #rmse_scores = [ model_1_rmse, model_2_rmse, model_3_rmse, model_4_rmse, model_5_rmse ]
    test_rmse = [ test_rmse_lr, test_rmse_dt, test_rmse_knn, test_rmse_xgb, test_rmse_dt ]
    
    return model_names, test_rmse

model_names, test_rmse_task1 = find_best_classical_model(X_train, X_test, y_train1, y_test1)   
model_names, test_rmse_task2 = find_best_classical_model(X_train, X_test, y_train2, y_test2)   

In [20]:
print(model_names)
print(test_rmse_task1)
print(test_rmse_task2)

['Linear Regression', 'Decision Trees', 'KNN', 'XGBoost', 'GBDT']
[0.3028678584809251, 0.34825993895558943, 0.324088535092072, 0.3536270837096936, 0.34825993895558943]
[26.181892002473468, 28.862525636492872, 29.57465522392491, 28.64420986158485, 28.862525636492872]


In [21]:
X_train.shape

(466, 9)

In [None]:
9