## Imports

In [424]:
import re # for regular expressions
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk # for text manipulation
import warnings 
import re
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb

warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
pd.set_option('display.max_colwidth', -1)



In [457]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [461]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Data Reading

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
def get_data(link,fileName):
  import pandas as pd
  id = link.split('/')[5]
  downloaded = drive.CreateFile({'id':id}) 
  downloaded.GetContentFile(fileName)  
  return pd.read_csv(fileName)

In [0]:
link_train = "https://drive.google.com/file/d/1GSYh3OB3gIcTaFTVpduUIiNWw9ydglJS/view?usp=sharing"
link_test = "https://drive.google.com/file/d/1JDwGzKXznSHXc6t9ur_Ci4EFeaWlgLF8/view?usp=sharing"
train_raw,test_raw = get_data(link_train,"train.csv"),get_data(link_test,"test.csv")

## Functions

In [0]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary
                         
            continue
    if count != 0:
        vec /= count
    return vec

def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

def lookup_dict(text, dictionary):
    for word in text.split():
        if word.lower() in dictionary:
            if word.lower() in text.split():
                text = text.replace(word, dictionary[word.lower()])
    return text

## Data Pre-Processing

In [0]:
train,test = train_raw.copy(),test_raw.copy()

In [430]:
train.shape,test.shape

((3235, 6), (1387, 5))

In [431]:
train.columns

Index(['id', 'original_text', 'lang', 'retweet_count', 'original_author',
       'sentiment_class'],
      dtype='object')

In [0]:
#  Replacing classes -1,0,1 with 0,1,2 because xgb.cv giving error as label should be in [0.num_class]
replace_sentiment_class = {"sentiment_class": {0: 0, 1.0: 1,-1.0:2}}
train.replace(replace_sentiment_class, inplace=True)

In [0]:
inv_replace_sentiment_class = {"sentiment_class":{v: k for k, v in replace_sentiment_class['sentiment_class'].items()}}

In [0]:
data = train.append(test, ignore_index=True)

In [437]:
data.shape

(4622, 6)

In [0]:
from html.parser import HTMLParser
html_parser = HTMLParser()

In [0]:
data['text_cleaned'] = data['original_text'].apply(lambda x: html_parser.unescape(x))

In [0]:
data['text_cleaned'] = data['text_cleaned'].str.replace('http\S+|www.\S+|pic.twitter.com\S+|instagram.com\S+|igshid\S+|buff.ly\S+|sallycleggart.co.uk/index.php/sall ys-blog/\S+|pic.twitter.com/1BF4y0wKB8\S+', '', case=False)

In [0]:
data['text_cleaned'] = np.vectorize(remove_pattern)(data['text_cleaned'], "@[\w]*")

In [0]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: x.lower())

In [0]:
# Apostrophe Dictionary
apostrophe_dict = {
"ain't": "are not",
"aren't": "am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "I would",
"i'd've": "I would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": " when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [0]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: lookup_dict(x,apostrophe_dict))

In [0]:
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [0]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: lookup_dict(x,short_word_dict))

In [0]:
emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}

In [0]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: lookup_dict(x,emoticon_dict))

In [0]:
# data['text_cleaned'].str.extractall(r'(\#\w+)')

def find_hashtags(x,length):
    import re
    tweet_regex = re.compile(r"#([^\s|#]+)")
    if length:
        return len(re.findall(tweet_regex, x))
    else:
        return  ','.join(re.findall(tweet_regex, x))

In [0]:
# Only for data exploration
# data['hastags'] = data['text_cleaned'].apply(lambda x: find_hashtags(x,False))
data['hastag_count'] = data['text_cleaned'].apply(lambda x: find_hashtags(x,True))

In [0]:
# len(set(data['hastags'].str.cat(sep=', ').split(',')))

In [0]:
# Removing # from the text
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

In [0]:
# Removing characters other than text
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x))

In [0]:
# Collect words greater than 2 letters
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [0]:
data["text_cleaned"] = data["text_cleaned"].apply(lambda x: x.split()) # tokenizing

In [0]:
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
    
data['text_cleaned'] = data['text_cleaned'].apply(lambda x: remove_stopwords(x))

In [0]:
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

data['text_cleaned'] = data['text_cleaned'].apply(lambda x: lemmatizer(x))

In [463]:
data.shape

(4622, 8)

In [0]:
text_cleaned = data['text_cleaned']

In [466]:
text_cleaned

0       [happy, mothersday, amazing, mother, know, hard, able, see, mother, today, protect, vulnerable, member, society, beatcoronavirus]                                                                                    
1       [happy, mother, day, mum, sorry, cannot, bring, mother, day, flower, cwtch, honestly, point, would, walk, hot, coal, able, bell, soon, love, lot, xxx, need, photo, photo, app, goo, vxblrsczd]                      
2       [happy, mother, day, mother, day, work, today, quiet, time, reflect, dog, walk, finish, jigsaw, garden, learn, guitar, chord, drunk, strawberry, gin, tonic, watch, lee, even, dvd, favourite, place, visit, isolate]
3       [happy, mother, day, beautiful, woman, royalty, soothes, mummy, jeremy, emerald, prayforroksie, ultimateloveng]                                                                                                      
4       [remembering, amazing, lady, made, late, grandmother, iris, mum, carol, great, grandmother, ethel, misse

## Word to vector Model

In [0]:
features_size = 200
model_w2v = gensim.models.Word2Vec(
            text_cleaned,
            size=features_size, # desired no. of features/independent variables 
            window=5, # context window size(neighbours)
            min_count=5,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 4, # no.of cores
            seed = 143)

In [497]:
model_w2v.train(text_cleaned, total_examples= len(data["text_cleaned"]), epochs=100)

(4610759, 8457600)

In [498]:
model_w2v.wv.most_similar(positive="mother")

  if np.issubdtype(vec.dtype, np.int):


[('day', 0.7025200128555298),
 ('mum', 0.656665027141571),
 ('happy', 0.6490563154220581),
 ('mothersday', 0.5588871240615845),
 ('love', 0.41164329648017883),
 ('bobrisky', 0.40111392736434937),
 ('camilla', 0.37090376019477844),
 ('today', 0.367448091506958),
 ('tatafonaija', 0.3665328025817871),
 ('reviveoldpost', 0.35983604192733765)]

In [499]:
wordvec_arrays = np.zeros((len(tokenized_tweet), features_size))

for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], features_size)
    
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(4622, 200)

In [500]:
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,-0.000188,0.265358,0.095655,0.150964,-0.067654,0.027559,-0.045507,-0.082419,-0.07265,-0.156762,0.114636,0.051653,-0.044679,0.008788,-0.023185,0.128527,-0.188034,-0.19667,0.095668,0.034915,0.004656,0.119129,0.173998,0.118803,0.086838,0.037958,-0.013587,0.096281,0.067215,0.230139,-0.155782,0.006908,-0.122772,0.008636,-0.001822,-0.211675,0.424001,-0.069137,0.029966,0.211141,...,-0.003282,0.16354,0.008736,-0.080391,0.020987,0.158512,-0.148818,0.016138,0.208832,-0.080221,0.076942,-0.112528,-0.144819,0.290329,-0.129797,-0.074116,0.168436,0.279121,-0.009791,-0.060107,0.239897,0.0629,0.077178,0.032479,0.149311,0.020548,0.110727,0.066381,-0.393679,-0.048408,0.102754,-0.214408,-0.171333,-0.174121,0.271824,-0.022272,-0.22786,-0.255926,-0.142268,-0.228782
1,-0.068576,0.099872,-0.05422,-0.044929,0.008066,0.254436,-0.153402,-0.00483,-0.232372,0.016476,0.081268,0.314312,0.011499,0.110769,-0.040501,0.034419,0.009462,-0.186261,0.217043,-0.168138,0.015243,0.241571,-0.042231,0.006976,0.098822,-0.204806,-0.110935,0.050727,-0.166539,0.106907,-0.080297,0.157051,-0.285137,0.053747,0.184899,-0.123049,0.17287,-0.124131,0.215721,0.120648,...,-0.083742,0.255854,-0.165919,-0.132731,-0.070217,-0.120706,-0.105756,-0.132731,-0.105292,-0.061734,-0.002637,-0.066562,-0.012999,0.305665,-0.063237,-0.144535,-0.117166,0.163968,0.140935,-0.055822,0.149742,-0.06201,0.282495,-0.17388,0.091495,0.057668,-0.10081,0.212184,-0.224503,-0.218727,0.117443,-0.076498,0.105002,-0.110911,0.397828,-0.113759,-0.331654,-0.316685,-0.169942,-0.014133
2,-0.091884,0.323897,-0.224006,0.212561,-0.243019,0.176958,0.098883,0.060729,-0.104169,-0.013659,0.205683,0.076985,-0.030415,0.025898,-0.053967,-0.036048,-0.073272,-0.078049,0.211638,0.095958,0.037137,0.168789,-0.066877,0.202579,0.2473,-0.057246,-0.325279,0.133982,-0.049218,0.089198,0.087518,0.230857,-0.149411,-0.003105,0.256943,0.023511,0.452676,0.101934,0.25983,0.078151,...,-0.068865,0.117213,0.194604,-0.070019,-0.027576,-0.101005,-0.20134,0.023981,0.168415,0.106984,-0.007943,-0.294007,-0.217377,0.242109,-0.233862,-0.117197,-0.159625,0.227168,-0.095159,0.042489,0.152512,0.06446,0.03716,0.061508,0.138094,0.149146,0.139122,0.034133,-0.151333,-0.04037,0.122351,-0.023816,-0.230262,-0.202655,0.166478,0.074905,-0.100206,-0.24391,-0.298723,-0.103791
3,0.100441,0.025683,0.155914,-0.132472,-0.022873,0.14976,0.022686,0.075692,-0.271995,-0.151662,0.1761,0.086458,-0.054771,0.054315,-0.198975,-0.065989,-0.305952,0.020787,0.271063,-0.082442,0.170746,0.17619,-0.068469,0.177649,-0.109993,0.0793,0.001462,0.187179,-0.018747,-0.238667,-0.221854,0.054147,-0.330342,0.006412,0.16357,-0.006899,0.404212,-0.06748,0.000419,-0.003779,...,-0.144279,-0.033371,0.08788,-0.161384,-0.096133,0.046293,-0.327866,0.163323,0.2677,0.16225,-0.105279,0.036928,-0.14312,-0.093955,-0.121182,0.067829,-0.06301,0.203484,0.187032,-0.186332,0.110662,0.103904,0.05878,0.035012,-0.087056,0.014825,0.121014,-0.148551,-0.1574,0.019748,0.005678,-0.213801,-0.054261,-0.314911,0.134728,-0.182641,-0.275192,-0.216109,-0.276587,-0.099465
4,-0.058115,0.413648,0.062769,-0.384501,0.02661,0.283552,-0.129336,-0.102163,-0.063031,0.314546,0.092576,0.155325,-0.117348,-0.295103,-0.225303,0.023501,-0.103684,-0.071014,0.289105,-0.321197,0.104274,0.032117,0.113012,0.044281,0.148769,-0.061859,-0.215494,-0.108649,0.121444,-0.004906,-0.21379,0.191808,-0.248846,0.162335,0.030721,-0.022986,0.391599,-0.144603,-0.251644,-0.027346,...,0.079413,0.172465,-0.01406,-0.039219,0.127769,0.164355,-0.272927,-0.152296,0.204926,0.160745,0.039171,-0.06718,-0.149025,0.106377,-0.096253,0.120622,-0.400964,0.32099,0.190144,0.027369,0.198534,0.021731,0.221569,0.211143,0.066733,0.04495,-0.071001,0.027084,-0.220888,-0.178551,0.307481,-0.114006,-0.101576,-0.24504,0.250267,-0.151413,-0.156983,-0.144038,-0.248161,-0.143914


In [0]:
train_w2v = wordvec_df.iloc[:3235,:]
test_w2v = wordvec_df.iloc[3235:,:]

In [502]:
train_w2v.shape,test_w2v.shape

((3235, 200), (1387, 200))

In [0]:
# Splitting Train dataframe into train and validation
x_train, x_val, y_train, y_val = train_test_split(train_w2v,train['sentiment_class'],  
                                                          random_state=42, 
                                                          test_size=0.25)

In [504]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((2426, 200), (809, 200), (2426,), (809,))

## Model Building

In [0]:
# xgb = XGBClassifier(learning_rate =0.1,
#                     n_estimators=1000,
#                     max_depth=8,
#                     min_child_weight=6,
#                     gamma=1.2,
#                     subsample=0.9,
#                     colsample_bytree=0.6,
#                     scale_pos_weight=1,
#                     objective='multi:softprob',
#                     seed=27).fit(x_train, y_train)

#  Base Model given F1 Score of 0.4956736711990111

In [0]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(test_w2v)

In [507]:
set(dtrain.get_label())

{0.0, 1.0, 2.0}

In [0]:
params = {
    'objective':'multi:softmax',
    'booster': 'gbtree',
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    
}

In [0]:
def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(np.int)
    preds = preds.astype(np.int)
    return [('f1_score', f1_score(labels, preds,average="micro"))]

In [0]:
# Tuning max_depth and min_child_weight
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10) 
    for min_child_weight in range(5,8)
]

In [511]:
max_f1 = 0. # initializing with 0
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['num_class'] = len(np.unique(dtrain.get_label()))

    # Cross-validation
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10,
    )

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with max_depth=6, min_child_weight=5
	F1 Score 0.4711486 for 0 rounds
CV with max_depth=6, min_child_weight=6
	F1 Score 0.4670222 for 0 rounds
CV with max_depth=6, min_child_weight=7
	F1 Score 0.46826460000000003 for 0 rounds
CV with max_depth=7, min_child_weight=5
	F1 Score 0.45342080000000007 for 0 rounds
CV with max_depth=7, min_child_weight=6
	F1 Score 0.4542438 for 0 rounds
CV with max_depth=7, min_child_weight=7
	F1 Score 0.4587808 for 0 rounds
CV with max_depth=8, min_child_weight=5
	F1 Score 0.4406474 for 0 rounds
CV with max_depth=8, min_child_weight=6
	F1 Score 0.43776860000000006 for 0 rounds
CV with max_depth=8, min_child_weight=7
	F1 Score 0.44765440000000006 for 0 rounds
CV with max_depth=9, min_child_weight=5
	F1 Score 0.42992739999999996 for 0 rounds
CV with max_depth=9, min_child_weight=6
	F1 Score 0.42745920000000004 for 0 rounds
CV with max_depth=9, min_child_weight=7
	F1 Score 0.4274534 for 0 rounds
Best params: 6, 5, F1 Score: 0.4711486


In [0]:
params['max_depth'] = 6
params['min_child_weight'] = 7

In [0]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5,10)]
    for colsample in [i/10. for i in range(5,10)]
]

In [514]:
max_f1 = 0.
best_params = None
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # Update our parameters
    params['colsample'] = colsample
    params['subsample'] = subsample

    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample)

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with subsample=0.5, colsample=0.5
	F1 Score 0.4377642 for 0 rounds
CV with subsample=0.5, colsample=0.6
	F1 Score 0.4377642 for 0 rounds
CV with subsample=0.5, colsample=0.7
	F1 Score 0.4377642 for 0 rounds
CV with subsample=0.5, colsample=0.8
	F1 Score 0.4377642 for 0 rounds
CV with subsample=0.5, colsample=0.9
	F1 Score 0.4377642 for 0 rounds
CV with subsample=0.6, colsample=0.5
	F1 Score 0.454249 for 0 rounds
CV with subsample=0.6, colsample=0.6
	F1 Score 0.454249 for 0 rounds
CV with subsample=0.6, colsample=0.7
	F1 Score 0.454249 for 0 rounds
CV with subsample=0.6, colsample=0.8
	F1 Score 0.454249 for 0 rounds
CV with subsample=0.6, colsample=0.9
	F1 Score 0.454249 for 0 rounds
CV with subsample=0.7, colsample=0.5
	F1 Score 0.4542464 for 0 rounds
CV with subsample=0.7, colsample=0.6
	F1 Score 0.4542464 for 0 rounds
CV with subsample=0.7, colsample=0.7
	F1 Score 0.4542464 for 0 rounds
CV with subsample=0.7, colsample=0.8
	F1 Score 0.4542464 for 0 rounds
CV with subsample=0.7, co

In [0]:
params['subsample'] = .8
params['colsample_bytree'] = .5

In [516]:
max_f1 = 0.
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # Update ETA
    params['eta'] = eta

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta

print("Best params: {}, F1 Score: {}".format(best_params, max_f1))


CV with eta=0.3
	F1 Score 0.4761004 for 0 rounds
CV with eta=0.2
	F1 Score 0.4761004 for 0 rounds
CV with eta=0.1
	F1 Score 0.4761004 for 0 rounds
CV with eta=0.05
	F1 Score 0.4761004 for 0 rounds
CV with eta=0.01
	F1 Score 0.4761004 for 0 rounds
CV with eta=0.005
	F1 Score 0.4761004 for 0 rounds
Best params: 0.3, F1 Score: 0.4761004


In [0]:
params['eta'] = .3

In [518]:
max_f1 = 0.
best_params = None
for gamma in range(0,15):
    print("CV with gamma={}".format(gamma/10.))

    # Update ETA
    params['gamma'] = gamma/10.

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = gamma/10.

print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

CV with gamma=0.0
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.1
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.2
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.3
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.4
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.5
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.6
	F1 Score 0.5004215999999999 for 1 rounds
CV with gamma=0.7
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.8
	F1 Score 0.4761004 for 0 rounds
CV with gamma=0.9
	F1 Score 0.4761004 for 0 rounds
CV with gamma=1.0
	F1 Score 0.47651279999999996 for 0 rounds
CV with gamma=1.1
	F1 Score 0.47651279999999996 for 0 rounds
CV with gamma=1.2
	F1 Score 0.47692499999999993 for 0 rounds
CV with gamma=1.3
	F1 Score 0.47692499999999993 for 0 rounds
CV with gamma=1.4
	F1 Score 0.47733659999999994 for 0 rounds
Best params: 0.6, F1 Score: 0.5004215999999999


In [0]:
params['gamma'] = 0.0

In [520]:
params

{'booster': 'gbtree',
 'colsample': 0.9,
 'colsample_bytree': 0.5,
 'eta': 0.3,
 'gamma': 0.0,
 'max_depth': 6,
 'min_child_weight': 7,
 'num_class': 3,
 'objective': 'multi:softmax',
 'subsample': 0.8}

In [521]:
xgb_model = xgb.train(
    params,
    dtrain,
    feval= custom_eval,
    num_boost_round= 1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=10
)

[0]	Validation-merror:0.520396	Validation-f1_score:0.479604
Multiple eval metrics have been passed: 'Validation-f1_score' will be used for early stopping.

Will train until Validation-f1_score hasn't improved in 10 rounds.
[1]	Validation-merror:0.514215	Validation-f1_score:0.485785
[2]	Validation-merror:0.505562	Validation-f1_score:0.494438
[3]	Validation-merror:0.511743	Validation-f1_score:0.488257
[4]	Validation-merror:0.514215	Validation-f1_score:0.485785
[5]	Validation-merror:0.524104	Validation-f1_score:0.475896
[6]	Validation-merror:0.526576	Validation-f1_score:0.473424
[7]	Validation-merror:0.52534	Validation-f1_score:0.47466
[8]	Validation-merror:0.524104	Validation-f1_score:0.475896
[9]	Validation-merror:0.53152	Validation-f1_score:0.46848
[10]	Validation-merror:0.536465	Validation-f1_score:0.463535
[11]	Validation-merror:0.532757	Validation-f1_score:0.467244
[12]	Validation-merror:0.537701	Validation-f1_score:0.462299
Stopping. Best iteration:
[2]	Validation-merror:0.505562	V

In [0]:
def print_predictions(fileName,clf):
  from google.colab import files
  test_pred = clf.predict(dtest)
  result = pd.DataFrame()
  result['id'] = test['id']
  result['sentiment_class'] = pd.Series(test_pred)
  result.replace(inv_replace_sentiment_class, inplace=True)
  print(np.unique(result['sentiment_class']))
  print("Sample Data from Result")
  print(result.head())
  result.to_csv(fileName + '.csv', index=False)
  files.download(fileName + '.csv')

In [524]:
print_predictions("predictions_w2v_xgb_tuned_f200",xgb_model)

[-1.  0.  1.]
Sample Data from Result
             id  sentiment_class
0  1.246628e+18  0.0            
1  1.245898e+18  0.0            
2  1.244717e+18  0.0            
3  1.245730e+18  0.0            
4  1.244636e+18  0.0            
