In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [None]:
# import nltk and download the WordNet corpus files
# (the WordNetLemmatizer used inside cleanData looks words up in this corpus)
import nltk
nltk.download('wordnet')

In [None]:
# download and unpack the dataset - skipped automatically once the CSV files are on disk
import os
import requests, zipfile, io

DATA_URL = 'https://he-s3.s3.amazonaws.com/media/hackathon/predict-the-happiness/predict-the-happiness/f2c2f440-8-dataset_he.zip'

# Only hit the network when the extracted files are missing, so that a
# "Restart & Run All" does not download the archive again on every run.
# NOTE(review): assumes the archive unpacks train.csv / test.csv into the
# working directory, which is where the loading cell reads them from.
if not (os.path.exists('train.csv') and os.path.exists('test.csv')):
    response = requests.get(DATA_URL, timeout=60)
    # stop on an HTTP error instead of trying to unzip an error page
    response.raise_for_status()
    # the context manager closes the in-memory archive once extraction is done
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall()

In [None]:
# read the raw train/test splits and the already pre-processed full dataset
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

full_cleaned_data = pd.read_csv('alldata_cleaned.csv')

In [None]:
# stack train and test into one frame; test rows get a missing response
test = test.assign(Is_Response=np.nan)
alldata = pd.concat([train, test], ignore_index=True)

In [None]:
# define function for stemming and lemmatizing the text
def cleanData(text, stemming = False, lemmatize=False):
    """Normalise one piece of text, optionally stemming and/or lemmatizing it.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.
    stemming : bool, default False
        When True, every whitespace-separated token is stemmed with
        ``PorterStemmer``.
    lemmatize : bool, default False
        When True, every whitespace-separated token is lemmatized with
        ``WordNetLemmatizer`` (applied after stemming if both are enabled).

    Returns
    -------
    str
        The text with ``-PRON-`` tokens and hyphens removed. When stemming or
        lemmatizing is enabled the tokens are re-joined with single spaces.
    """
    txt = str(text)
    # Remove the literal "-PRON-" token BEFORE stripping hyphens (presumably a
    # pronoun placeholder left by the earlier pre-processing - confirm).
    # The previous pattern `[PRON]` was a character class and therefore deleted
    # every capital P, R, O and N from the text ("Nice Room" -> "ice oom").
    txt = re.sub(r'-PRON-', r'', txt)
    txt = re.sub(r'-', r'', txt)

    if stemming:
        stemmer = PorterStemmer()
        txt = " ".join(stemmer.stem(token) for token in txt.split())

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        txt = " ".join(lemmatizer.lemmatize(token) for token in txt.split())

    return txt

In [None]:
# define CountVectorizer of 9000 features and ngrams = (1,2)
# i.e. single words and two-word sequences, with the vocabulary capped at 9000 entries
countvec = CountVectorizer(max_features = 9000, ngram_range=(1, 2))

In [None]:
# run every description through cleanData with both stemming and lemmatization switched on
full_cleaned_data['Description'] = full_cleaned_data['Description'].apply(
    cleanData, stemming=True, lemmatize=True
)

In [None]:
# transform the text data using CountVectorizer
# fit_transform learns the vocabulary from the cleaned descriptions and returns their term-count matrix
countvecdata = countvec.fit_transform(full_cleaned_data['Description'])

In [None]:
# integer-encode the categorical columns, fitting a fresh encoder for each one
cols = ['Browser_Used','Device_Used']

for column_name in cols:
    full_cleaned_data[column_name] = LabelEncoder().fit_transform(full_cleaned_data[column_name])

In [None]:
# materialise the sparse count matrix as a dense DataFrame
countvec_df = pd.DataFrame(data=countvecdata.toarray())

In [None]:
# append column header
# Names are built from the column count rather than from the current labels,
# so re-running this cell yields col0..colN again instead of prefixing the
# already-renamed headers a second time (colcol0, ...).
countvec_df.columns = ['col' + str(position) for position in range(countvec_df.shape[1])]

In [None]:
# split the count features by position: the first len(train) rows are train, the remainder is test
n_train_rows = len(train)
countvec_df_train = countvec_df.iloc[:n_train_rows]
countvec_df_test = countvec_df.iloc[n_train_rows:]

In [None]:
# set train and test features with response
# rows whose response is missing in `alldata` are the test rows
is_test_row = pd.isnull(alldata.Is_Response)
# .copy() gives each split its own data, so the later column assignment on
# train_feats does not write into a slice of full_cleaned_data
# (which pandas reports as SettingWithCopyWarning)
train_feats = full_cleaned_data[~is_test_row].copy()
test_feats = full_cleaned_data[is_test_row].copy()

In [None]:
# encode response as 1 ('happy') and 0 (anything else)
# isin(['happy', 1]) also accepts values that are already encoded, so running
# this cell a second time keeps the labels instead of resetting them all to 0.
# assign() builds a new frame rather than writing into a slice of another frame.
train_feats = train_feats.assign(
    Is_Response=train_feats['Is_Response'].isin(['happy', 1]).astype('int64')
)

In [None]:
# concatenate the dataframes to get actual train and test sets
# each set = the label-encoded columns listed in `cols` placed side by side (axis=1) with the count-vectorizer columns
# NOTE(review): an axis=1 concat matches rows by index label - presumably both operands carry the same index here; confirm
train_feats2 = pd.concat([train_feats[cols], countvec_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], countvec_df_test], axis=1)

In [None]:
# set target: the Is_Response column of the training rows
target = train_feats['Is_Response']

In [None]:
# importing LightGBM
import lightgbm as lgb

In [None]:
# make training dataset: wrap the training features and their labels in a LightGBM Dataset
d_train = lgb.Dataset(train_feats2, label = target)

In [None]:
# defining parameters - tuned
# The number of boosting rounds is deliberately NOT part of this dict.
# LightGBM treats `n_estimators` as an alias of `num_iterations`, and a value
# found in `params` overrides the `num_boost_round` argument of lgb.cv /
# lgb.train. Keeping 'n_estimators': 100 here capped both the 25000-round
# cross-validation and the final training run at 100 rounds.

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.002,
    'num_leaves': 72,
    'feature_fraction': 0.2,
    'bagging_fraction': 0.4,
    'bagging_freq': 1,
}


In [None]:
# running cv to get the best round
# 2-fold, shuffled, stratified cross-validation with up to 25000 boosting rounds (verbose_eval=20, early_stopping_rounds=200)
# NOTE(review): whether verbose_eval / early_stopping_rounds are accepted as keyword arguments depends on the installed LightGBM version - confirm
lgb_cv = lgb.cv(params, d_train, num_boost_round=25000, nfold=2, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=200)

In [None]:
# smallest value in the cross-validation 'binary_logloss-mean' series
print(min(lgb_cv['binary_logloss-mean']))

In [None]:
# get the number of boosting rounds with the lowest binary_logloss
# lgb.cv records one entry per boosting round, so the entry at 0-based
# position i is the loss after i + 1 rounds. The position of the minimum is
# therefore converted to a round count by adding 1 (using the bare index
# trained one round too few, and zero rounds when the first entry was best).
cv_logloss = lgb_cv['binary_logloss-mean']
nround = int(np.argmin(cv_logloss)) + 1
print(nround)

In [None]:
# print minimum binary_logloss (mean across the cross-validation folds)
print(np.min(lgb_cv['binary_logloss-mean']))

In [None]:
# train the model on the full training dataset for nround boosting rounds
model = lgb.train(params, d_train, num_boost_round=nround)

In [None]:
# make predictions with the model on the test features
# (the values are later compared against a cutoff in to_labels)
preds = model.predict(test_feats2)

In [None]:
# display the raw predictions returned by model.predict
print(preds)

In [None]:
# defining reverse encoding function and make submission file

def to_labels(x, cutoff=0.55):
    """Convert a predicted score into the submission label.

    Parameters
    ----------
    x : float
        Predicted score for one row.
    cutoff : float, default 0.55
        Decision threshold. The default is the value chosen based on accuracy;
        pass another value to experiment without editing the function.

    Returns
    -------
    str
        "happy" when ``x`` is strictly greater than ``cutoff``,
        otherwise "not_happy".
    """
    return "happy" if x > cutoff else "not_happy"

# turn each prediction into its label and write the two-column submission file
predicted_labels = [to_labels(score) for score in preds]
sub3 = pd.DataFrame({'User_ID': test.User_ID, 'Is_Response': predicted_labels})
sub3 = sub3.loc[:, ['User_ID', 'Is_Response']]
sub3.to_csv('senti_best_submission.csv', index=False)