# **NATURAL LANGUAGE PROCESSING WITH DISASTER TWEETS**

![](https://miro.medium.com/max/1476/0*z9jqZsQ7JSTZGSZz.jpg)

In [None]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [None]:
train_df=pd.read_csv('../input/nlp-getting-started/train.csv')
test_df=pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
train_df.columns

In [None]:
train_df.head()

In [None]:
test_df.columns

In [None]:
test_df.head()

In [None]:
test_df.isnull().sum()

In [None]:
train_df['keyword'].value_counts()

In [None]:
sns.barplot(y=train_df['keyword'].value_counts()[:25].index,x=train_df['keyword'].value_counts()[:25],orient='horizontal',palette='rocket')

In [None]:
sns.barplot(y=train_df['location'].value_counts()[:25].index,x=train_df['location'].value_counts()[:25],orient='horizontal',palette='viridis')

In [None]:
train_df['location'].value_counts()

In [None]:
train_df.isnull().sum()

In [None]:
#1. Function to replace NAN values with mode value
def impute_nan_most_frequent_category(DataFrame,ColName):
    """Add a ``<ColName>_Imputed`` column with missing values replaced by the mode.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to extend. It is modified in place: a new column named
        ``ColName + "_Imputed"`` is added; the source column is left untouched.
    ColName : str
        Name of the column whose missing values should be filled.

    Returns
    -------
    None
    """
    imputed_name = ColName + "_Imputed"
    # .mode() ignores NaN; its first entry is the most frequent category
    modes = DataFrame[ColName].mode()
    if modes.empty:
        # Column has no non-missing value, so there is nothing to impute with
        DataFrame[imputed_name] = DataFrame[ColName]
        return
    # Assign the filled copy directly: calling fillna(inplace=True) on a column
    # selection is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    DataFrame[imputed_name] = DataFrame[ColName].fillna(modes.iloc[0])
#2. Call function to impute most occured category
for Columns in ['keyword','location']:
    impute_nan_most_frequent_category(train_df,Columns)
    
# Display imputed result
train_df[['keyword','keyword_Imputed','location','location_Imputed']].head(10)
#3. Drop actual columns
train_df = train_df.drop(['keyword','location'], axis = 1)

In [None]:
train_df.isnull().sum()

In [None]:
# removing the null values from text dataset as well
for Columns in ['keyword','location']:
    impute_nan_most_frequent_category(test_df,Columns)
    
# Display imputed result
test_df[['keyword','keyword_Imputed','location','location_Imputed']].head(10)
#3. Drop actual columns
test_df=test_df.drop(['keyword','location'], axis = 1)

In [None]:
test_df.isnull().sum()

In [None]:
train_df.info()

In [None]:
train_df['target'].value_counts()

In [None]:
train_df.groupby('target').describe()

In [None]:
# Let's get the length of the tweets
train_df['length']=train_df['text'].apply(len)
train_df

In [None]:
# Character length of each TEST tweet; it must be derived from test_df's own
# text, otherwise the column holds index-aligned lengths of training tweets.
test_df['length']=test_df['text'].apply(len)
test_df

In [None]:
train_df['text'].values[:60]

In [None]:
train_df['length'].plot(bins=100, kind='hist',color='magenta')

In [None]:
test_df['length'].plot(bins=100,kind='hist',color='blue')

In [None]:
#separating the tweets according to the target value given
nodis=train_df[train_df['target']==0]
nodis

In [None]:
nodis_tweets=nodis['text']
nodis_tweets.values[:30]

In [None]:
nodis.shape

In [None]:
dis=train_df[train_df['target']==1]
dis

In [None]:
dis_tweets=dis['text']
dis_tweets.values[:30]

**We can observe that most of the tweets contain URLs, so we will deal with them in the upcoming cells.**

In [None]:
#let's calculate the percentage of disaster and no disater
print('no disater percentage',(len(nodis)/len(train_df))*100,'%')

In [None]:
#let's calculate the percentage of disaster 
print('disater percentage',(len(dis)/len(train_df))*100,'%')

We can observe that a large share of the tweets carries no information about a disaster.

In [None]:
# Number of tweets per target value (0 = not disaster, 1 = disaster).
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
ax=sns.countplot(x='target',data=train_df)
ax.set_title('no disater tweets vs disaste tweets');

In [None]:
plt.style.use('ggplot')
f,axes=plt.subplots(1,2,figsize=(10,5))
f.suptitle('Characters in tweets')

tweet_len=train_df[train_df['target']==1]['text'].str.len()
ax1=sns.histplot(tweet_len,ax=axes[0],color='red')
# ax1.set(xlabel='common xlabel', ylabel='common ylabel')
ax1.set_title('disaster tweets')

tweet_len=train_df[train_df['target']==0]['text'].str.len()
ax2=sns.histplot(tweet_len,ax=axes[1])
# ax2.set(xlabel='common xlabel', ylabel='common ylabel')
ax2.set_title('Not disaster tweets')

Lets remove punctuation and stopwords from the text given

In [None]:
# let us first visualize the count of punctuations and stopwords

In [None]:
# Concatenate every tweet into one long string for character-level counting.
# str.join builds the result in one pass instead of re-copying it per tweet.
text=''.join(train_df['text'])

In [None]:
from collections import defaultdict,Counter
import string

# Frequency of every character in the concatenated tweets
char_frequencies=Counter(text)

# Keep only the characters that are ASCII punctuation, with their counts
count_punctuation=defaultdict(int)
count_punctuation.update(
    (symbol,occurrences)
    for symbol,occurrences in char_frequencies.items()
    if symbol in string.punctuation
)
print(count_punctuation)

In [None]:
# sns.set(rc={'figure.figsize':(10,5)})
# ax=sns.barplot(x='punctuation',y='count',
#             data=pd.DataFrame(count_punctuation.items()
#                               ,columns=['punctuation','count']).sort_values(by=['count'], ascending=False))
# ax.set_title('Count of Punctuations')

In [None]:
import string
string.punctuation


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

Now let's apply both of these concepts to our text column.

In [None]:
# Cache for the stop-word set so the NLTK corpus is read only once
_STOPWORD_CACHE = {}

def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']

def message_cleaning(message):
    """Remove punctuation characters and English stop words from ``message``.

    Parameters
    ----------
    message : str
        Raw text.

    Returns
    -------
    str
        The remaining words joined by single spaces. Characters listed in
        ``string.punctuation`` are dropped first; then every whitespace-separated
        word whose lowercase form is an NLTK English stop word is removed.
        The case of the surviving words is left unchanged.
    """
    # Hoisted out of the per-word filter: the original re-evaluated
    # stopwords.words('english') for every single word.
    stop_words = _english_stopwords()
    punct_free = ''.join(char for char in message if char not in string.punctuation)
    kept_words = [word for word in punct_free.split() if word.lower() not in stop_words]
    return ' '.join(kept_words)

In [None]:
train_df['text']=train_df['text'].apply(message_cleaning)

In [None]:
train_df['text']=train_df['text'].str.lower()

In [None]:
print(train_df['text'])

In [None]:
test_df['text']=test_df['text'].apply(message_cleaning)

In [None]:
test_df['text']=test_df['text'].str.lower()

In [None]:
train_df['text'].str.contains('http?').sum()

In [None]:
train_df['text'].str.contains('http').sum()

In [None]:
review=train_df['text'][31]

In [None]:
review

In [None]:
pd.set_option('display.max_rows',50)
# to show all the 2000 rows otherwise it will display with the gap
train_df['text'].str.contains('http',regex=True)[:50]

In [None]:
import re

# Matches the literal "href", or "http" together with everything up to the next
# whitespace. \S* (instead of ".\w+") cannot run across a space into the next
# word and also covers raw "https://..." links.
_URL_PATTERN=re.compile(r'href|http\S*')

def remove_urls(review):
    """Delete URL-like tokens from ``review``.

    Parameters
    ----------
    review : str
        Text that may contain ``href`` or tokens starting with ``http``.

    Returns
    -------
    str
        ``review`` with every match removed (replaced by the empty string);
        surrounding whitespace is left as it was.
    """
    return _URL_PATTERN.sub('', review)

Let's remove emojis and numbers as well.

In [None]:
def removeNumbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


In [None]:
def removeEmojis(text):
    """Drop every whitespace-separated token that contains an emoji character.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The tokens of ``text`` that contain no emoji, joined by single spaces.
    """
    # emoji 2.x dropped the UNICODE_EMOJI dictionaries in favour of EMOJI_DATA;
    # fall back to the old lookup so 1.x installs keep working.
    if hasattr(emoji, "EMOJI_DATA"):
        known_emojis = emoji.EMOJI_DATA
    else:
        known_emojis = emoji.UNICODE_EMOJI["en"]
    emojis_in_text = {char for char in text if char in known_emojis}
    kept_tokens = [
        token for token in text.split()
        if not any(symbol in token for symbol in emojis_in_text)
    ]
    return ' '.join(kept_tokens)

In [None]:
train_df['text']=train_df['text'].apply(remove_urls)

In [None]:
test_df['text']=test_df['text'].apply(remove_urls)

In [None]:
# Strip digits from both splits so train and test share the same preprocessing
train_df['text']=train_df['text'].apply(removeNumbers)
test_df['text']=test_df['text'].apply(removeNumbers)

In [None]:
!pip install emoji
import emoji
train_df['text']=train_df['text'].apply(removeEmojis)

In [None]:
train_df['text'][31]
# lets again observe that there are any urls in the text or not

In [None]:
train_df['text'].str.contains('http').sum()

In [None]:
#separating the tweets according to the target value given
nodis=train_df[train_df['target']==0]
nodis

In [None]:
nodis_tweets=nodis['text']
nodis_tweets.values[:30]

In [None]:
nodis.shape

In [None]:
dis=train_df[train_df['target']==1]
dis

In [None]:
dis_tweets=dis['text']
dis_tweets.values[:30]

Generating a wordcloud for disaster tweets and non-disaster tweets

In [None]:
from wordcloud import WordCloud
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[26, 8])
wordcloud1 = WordCloud( background_color='black',
                        width=600,
                        height=400).generate(" ".join(dis_tweets))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Disaster Tweets',fontsize=40);

wordcloud2 = WordCloud( background_color='black',
                        width=600,
                        height=400).generate(" ".join(nodis_tweets))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Non Disaster Tweets',fontsize=40);

We can observe that in disaster tweets, words like suicide, bomber, fire and hiroshima appear more frequently.
Likewise, the word cloud of the non-disaster tweets makes it easy to see which words occur most frequently there.

**LET'S PERFORM TEXT NORMALIZATION**

Text normalization is the process of transforming text into a single canonical form that it might not have had before. For example, the words “gooood” and “gud” can be transformed to “good”, their canonical form. Similarly, ANTINATIONALIST can be transformed to nationalist.

Lets convert all the abbreviations to its full form. Thanks to https://www.kaggle.com/rftexas/text-only-bert-keras?scriptVersionId=31186559

In [None]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [None]:

def convert_abrev(word):
    """Return the expansion of ``word`` from ``abbreviations``, or ``word`` itself.

    The lookup uses the lowercase form of ``word``; when no entry exists the
    argument is returned unchanged (original casing preserved).
    """
    lowered = word.lower()
    if lowered in abbreviations:
        return abbreviations[lowered]
    return word

In [None]:
# convert_abrev looks up ONE word, so each tweet is expanded token by token;
# passing the whole tweet would only match tweets that consist of a single
# abbreviation.
def _expand_abbreviations(tweet):
    """Replace every whitespace-separated token of ``tweet`` via convert_abrev."""
    return ' '.join(convert_abrev(token) for token in tweet.split())

train_df['text']=train_df['text'].apply(_expand_abbreviations)
test_df['text']=test_df['text'].apply(_expand_abbreviations)

In [None]:
train_df['text']

**LETS DO STEMMING ON OUR DATA**

Stemming is an elementary rule-based process for removing inflectional forms from a given token. The output of the process is the stem of a word. For example, laughing, laughed, laughs and laugh will all become laugh after the stemming process.
Another example is studies, which will be converted to studi; basically, stemming tries to remove the suffix from the word.

In [None]:
train_df['text'][20]

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer(language='english')
tokens = train_df['text'][20].split()
clean_text = ' '

for token in tokens:
    print(token + ' --> ' + stemmer.stem(token))

**lets define a common function that could be applied on full text**

In [None]:
def stemWord(text):
    """Stem every whitespace-separated token of ``text`` with the Snowball stemmer.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, with no leading or
        trailing whitespace (an empty string for empty input).
    """
    stemmer = SnowballStemmer(language='english')
    # join avoids the two leading blanks that seeding the accumulator with ' '
    # and prepending " " to every stem used to produce
    return ' '.join(stemmer.stem(token) for token in text.split())

print("Text before stemWord function: " + train_df['text'][32])
print("Text after stemWord function: " + stemWord(train_df['text'][32]))

In [None]:
train_df['text']=train_df['text'].apply(stemWord)
test_df['text']=test_df['text'].apply(stemWord)

In [None]:
for txt in train_df['text'][:20]:
  print(txt);

**LETS APPLY LEMMATIZATION TO OUR TEXT COLUMN NOW**

Lemmatization is a systematic process of removing the inflectional form of a token and transform it into a lemma. It makes use of word structure, vocabulary, part of speech tags, and grammar relations.

The output of lemmatization is a root word called a lemma. for example “am”, “are”, “is” will be converted to “be”. Similarly, running runs, ‘ran’ will be replaced by ‘run’.


In [None]:
!pip install spacy
import spacy.cli
# nlp = spacy.load('en_core_web_lg')

In [None]:
nlp = spacy.cli.download('en_core_web_lg')


In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
doc = nlp("she will be performing all of her assigned responsibilities ")
#for token in doc:
   # print(token.lemma_)
for noun in doc.noun_chunks:
    print(noun.text)

In [None]:
for word in doc:
  print(word.text,word.lemma_)


In [None]:
def lemmatizeWord(text):
  """Replace every spaCy token of ``text`` by its lemma.

  Parameters
  ----------
  text : str
      Input text; it is parsed with the module-level ``nlp`` pipeline.

  Returns
  -------
  str
      The lemmas joined by single spaces, without the leading blanks that
      seeding the accumulator with ' ' used to add.
  """
  return ' '.join(token.lemma_ for token in nlp(text))
print("Text before lemmatizeWord function: " + train_df['text'][32])
print("Text after lemmatizeWord function: " + lemmatizeWord(train_df['text'][32]))

doc = "Atharva will be performing all of the assigned responsibilities"
lemmatizeWord(doc) 

In [None]:
train_df['text']=train_df['text'].apply(lemmatizeWord)
test_df['text']=test_df['text'].apply(lemmatizeWord)

Transforming into vector form 

**Bag of words model**

A bag-of-words (B.o.w) is a representation of text that describes the occurrence of words within a document. It involves two things:

A vocabulary of known words. A measure of the presence of known words. It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
# Learn the vocabulary from the training tweets only ...
bag_train=cv.fit_transform(train_df['text'])
# ... and reuse it for the test tweets. Refitting on the test set would build a
# different vocabulary, so the test columns would not match the features the
# models are trained on.
bag_test=cv.transform(test_df['text'])

**TFIDF Features** 

Another common representation is TF-IDF (Term Frequency - Inverse Document Frequency). TF-IDF is similar to bag of words, except that each term count is scaled down according to how many documents in the corpus contain that term, so words that appear in almost every document carry less weight.
Using TF-IDF can potentially improve your models.

Term Frequency: is a scoring of the frequency of the word in the current document.

TF = (Number of times term t appears in a document)/(Number of terms in the document)

Inverse Document Frequency: is a scoring of how rare the word is across documents.

IDF = 1+log(N/n), where, N is the number of documents and n is the number of documents a term t has appeared in


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
# Fit vocabulary and IDF weights on the training tweets only ...
tfidf_train=tfidf.fit_transform(train_df['text'])
# ... then project the test tweets into the same feature space. Refitting on the
# test set would yield different columns and weights.
tfidf_test=tfidf.transform(test_df['text'])

Word Vectors/Word Embeddings

A word embedding is a learned representation for text where words that have the same meaning have a similar representation. It is this approach to representing words and documents that may be considered one of the key breakthroughs of deep learning on challenging natural language processing problems.

In [None]:
# One document vector per tweet, taken from the spaCy pipeline loaded as `nlp`
with nlp.disable_pipes():
  train_vectors = np.array([nlp(text).vector for text in train_df.text])
  # The test vectors must come from the test tweets, not from train_df again
  test_vectors = np.array([nlp(text).vector for text in test_df.text])

In [None]:
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

#now lets perform predictions by testing out diffferent models

In [None]:
svc_wordEmbed=LinearSVC(random_state=1,dual=False,max_iter=10000)
svc_wordEmbed.fit(train_vectors,train_df['target'])

In [None]:
scores = model_selection.cross_val_score(svc_wordEmbed, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
y_predicted_train1=svc_wordEmbed.predict(train_vectors)
y_predicted_train1

In [None]:
from sklearn import metrics
r2_Score= metrics.r2_score(train_df['target'],y_predicted_train1)
print("r2 score is:{}".format(r2_Score))

In [None]:
cm=confusion_matrix(train_df['target'],y_predicted_train1)
sns.heatmap(cm,annot=True)

In [None]:
print(classification_report(train_df['target'],y_predicted_train1))

In [None]:
# seeing on the vectors through countvectorizer


In [None]:
svc_cv=LinearSVC(random_state=1,dual=False,max_iter=10000)
svc_cv.fit(bag_train,train_df['target'])

In [None]:
scores = model_selection.cross_val_score(svc_cv,bag_train, train_df["target"], cv=3, scoring="f1")
scores

In [None]:
y_predicted_train2=svc_cv.predict(bag_train)
y_predicted_train2

In [None]:
from sklearn import metrics
r2_Score= metrics.r2_score(train_df['target'],y_predicted_train2)
print("r2 score is:{}".format(r2_Score))

In [None]:
cm=confusion_matrix(train_df['target'],y_predicted_train2)
sns.heatmap(cm,annot=True)

In [None]:
svc_tfidf=LinearSVC(random_state=1,dual=False,max_iter=10000)
svc_tfidf.fit(tfidf_train,train_df['target'])

In [None]:
y_predicted_train=svc_tfidf.predict(tfidf_train)
y_predicted_train

In [None]:
cm=confusion_matrix(train_df['target'],y_predicted_train)
sns.heatmap(cm,annot=True)

#now lets give a try on XGBOOST

In [None]:
# first lets see for xgb word embed
from xgboost import XGBClassifier

In [None]:
xgb_wordEmbed = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)

In [None]:
xgb_wordEmbed.fit(train_vectors,train_df['target'])

In [None]:
scores = model_selection.cross_val_score(xgb_wordEmbed, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

In [None]:
y_predicted_train3=xgb_wordEmbed.predict(train_vectors)
y_predicted_train3

In [None]:
from sklearn import metrics
r2_Score= metrics.r2_score(train_df['target'],y_predicted_train3)
print("r2 score is:{}".format(r2_Score))

In [None]:
cm=confusion_matrix(train_df['target'],y_predicted_train3)
sns.heatmap(cm,annot=True)

In [None]:
clf_xgb_TFIDF = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                    subsample=0.8, nthread=10, learning_rate=0.1)
clf_xgb_TFIDF.fit(tfidf_train,train_df['target'])
scores = model_selection.cross_val_score(clf_xgb_TFIDF, tfidf_train, train_df["target"], cv=3, scoring="f1")
scores

#now lets try naive bayes

In [None]:
# Fitting a simple Naive Bayes on Counts
clf_NB = MultinomialNB()
clf_NB.fit(bag_train,train_df["target"])
scores = model_selection.cross_val_score(clf_NB, bag_train,train_df["target"], cv=3, scoring="f1")
scores

In [None]:
y_train_prediction4=clf_NB.predict(bag_train)
y_train_prediction4

In [None]:
from sklearn import metrics
r2_Score= metrics.r2_score(train_df['target'],y_train_prediction4)
print("r2 score is:{}".format(r2_Score))

In [None]:
# lets check for tfidf
clf_NB_TFIDF = MultinomialNB()
clf_NB_TFIDF.fit(tfidf_train,train_df["target"])
scores = model_selection.cross_val_score(clf_NB_TFIDF,tfidf_train,train_df["target"], cv=3, scoring="f1")
scores

We can observe from the above results that XGBoost on word embeddings gives better results than the other models, and its r2 score is also close to 0.94 (note that this score is computed on the same training data the model was fitted on).

#preparing the submission file

In [None]:
import pandas as pd
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
sample_submission.shape

In [None]:
# NOTE(review): y_predicted_train3 holds predictions for the TRAINING vectors
# (it comes from xgb_wordEmbed.predict(train_vectors)). Slicing to 3263 only
# makes the length fit, presumably the row count of sample_submission; it does
# not turn these values into predictions for the test tweets.
y_predicted_train3=y_predicted_train3[:3263]

In [None]:
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

# The submission must contain predictions for the TEST tweets. The vectors are
# built here directly from test_df so this cell does not depend on how any
# earlier variable was computed.
with nlp.disable_pipes():
    submission_vectors = np.array([nlp(text).vector for text in test_df['text']])
y_predicted_test = xgb_wordEmbed.predict(submission_vectors)

# assumes sample_submission rows are in the same order as test.csv (TODO confirm)
assert len(y_predicted_test) == len(sample_submission), \
    "number of test predictions must match the submission rows"

sample_submission["target"] = y_predicted_test.astype('int64')
sample_submission.to_csv("submission.csv", index=False)