In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Echo the full path of every file found under the read-only Kaggle input tree.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Extra strings that should be parsed as missing values in both CSV files.
missing_values = ["na", "n/a", "-", "NaN"]
csv_options = {"na_values": missing_values}

df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', **csv_options)
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', **csv_options)


In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:

_TEXT_PROCESS_CACHE = {}


def _text_process_resources():
    """Return the (stopword set, lemmatizer) pair, building it on first use."""
    if not _TEXT_PROCESS_CACHE:
        _TEXT_PROCESS_CACHE['stopword'] = set(stopwords.words('english'))
        _TEXT_PROCESS_CACHE['lem'] = WordNetLemmatizer()
    return _TEXT_PROCESS_CACHE['stopword'], _TEXT_PROCESS_CACHE['lem']


def text_process(tweet):
    """Clean one tweet before vectorization.

    Steps: strip URLs, tokenize, remove digits inside each token, discard
    stopwords and punctuation-only tokens, then lemmatize what is left.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN produced by
        ``pd.read_csv``) are treated as empty text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(tweet, str):
        return ''

    # The stopword set and lemmatizer are shared across calls instead of
    # being rebuilt for every tweet.
    stopword, lem = _text_process_resources()
    tweet = re.sub(r'http\S+', '', tweet)

    filtered_words = []
    for word in word_tokenize(tweet):
        word = ''.join(ch for ch in word if not ch.isdigit())
        # Character-wise check so multi-character tokens such as "..." are
        # dropped as well; all() over an empty token is True, so tokens
        # emptied by the digit removal are discarded too.
        if all(ch in string.punctuation for ch in word):
            continue
        # The NLTK stopword list is lowercase, so compare case-insensitively.
        if word.lower() in stopword:
            continue
        filtered_words.append(lem.lemmatize(word))
    return ' '.join(filtered_words)

In [None]:
# Replace the raw training tweets with their cleaned form.
cleaned_train_text = df_train['text'].apply(text_process)
df_train['text'] = cleaned_train_text

# Word Cloud

In [None]:
stopwordSet = set(STOPWORDS)

# Build one lowercase corpus string for the word cloud. Each tweet contributes
# its words exactly once so that word frequencies are not skewed toward
# tweets with many tokens.
tweet_chunks = []
for tweet in df_train['text']:
    tokens = [token.lower() for token in str(tweet).split()]
    if tokens:
        tweet_chunks.append(' '.join(tokens))
tweet_words = ' '.join(tweet_chunks)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwordSet,
                min_font_size = 10).generate(tweet_words)

# Plot the WordCloud image without axes or padding.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)


In [None]:
# Replace the raw test tweets with their cleaned form.
cleaned_test_text = df_test['text'].apply(text_process)
df_test['text'] = cleaned_test_text

In [None]:
# def multiclass_logloss(actual, predicted, eps=1e-15):

#     # Convert 'actual' to a binary array if it's not already:
#     if len(actual.shape) == 1:
#         actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
#         for i, val in enumerate(actual):
#             actual2[i, val] = 1
#         actual = actual2

#     clip = np.clip(predicted, eps, 1 - eps)
#     rows = actual.shape[0]
#     vsota = np.sum(actual * np.log(clip))
#     return -1.0 / rows * vsota

# def pred(prediction):
#     predict = []
#     for i in prediction[:,1]:
#         if i >= 0.5:
#             predict.append(1)
#         else:
#             predict.append(0)
#     return predict

In [None]:
y = df_train['target']
# Features are the cleaned tweet text followed by its keyword. A missing
# keyword is replaced by '' first: str + NaN evaluates to NaN, which would
# discard the tweet text for that row and turn it into the literal 'nan'
# once the array is cast to str.
x = df_train['text'] + ' ' + df_train['keyword'].fillna('')
# Stratified 90/10 split with a fixed seed so the validation set is reproducible.
xtrain, xvalid, ytrain, yvalid = train_test_split(x.values.astype(str), y,
                                                  stratify=y,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

In [None]:
# Report the shapes of the training and validation feature arrays.
for split in (xtrain, xvalid):
    print(split.shape)

# Building Basic Models
Let's start building our very first model.

Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression.

In [None]:
# Word-level unigram TF-IDF. The boolean options are passed as real booleans:
# recent scikit-learn versions validate parameter types and reject ints here.
tfv = TfidfVectorizer(max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 1), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')
# Vocabulary and IDF weights are learned from the training and validation texts.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)
# Build the test documents the same way as the training documents
# (cleaned text + keyword) so the model sees matching inputs at prediction time.
# NOTE(review): assumes test.csv has the same `keyword` column as train.csv.
xtest_text = (df_test['text'] + ' ' + df_test['keyword'].fillna('')).values.astype(str)
xtest_tfv = tfv.transform(xtest_text)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the TF-IDF features and score it on the validation split.
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)

validation_score = clf.score(xvalid_tfv, yvalid)
print("Logistic Regression Score: ", validation_score)




In [None]:


# Save the TF-IDF logistic-regression predictions for the test set as id/target rows.
test_predictions = clf.predict(xtest_tfv)
output = pd.DataFrame({'id': df_test.id, 'target': test_predictions})
output.to_csv('submissionLogReg.csv', index=False)

Instead of using TF-IDF, we can also use word counts as features. This can be done easily using CountVectorizer from scikit-learn.

In [None]:
# Word-level unigram + bigram counts.
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 2), stop_words='english')

# The vocabulary is learned from the training and validation texts;
# the test set is only transformed, never used for fitting.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)
# Build the test documents the same way as the training documents
# (cleaned text + keyword) so the model sees matching inputs at prediction time.
# NOTE(review): assumes test.csv has the same `keyword` column as train.csv.
xtest_text = (df_test['text'] + ' ' + df_test['keyword'].fillna('')).values.astype(str)
xtest_ctv = ctv.transform(xtest_text)

In [None]:
# Logistic regression trained on the n-gram count features.
clf = LogisticRegression(C=1.0).fit(xtrain_ctv, ytrain)

count_score = clf.score(xvalid_ctv, yvalid)
print("Logistic Regression count Score: ", count_score)

In [None]:

# Save the count-based logistic-regression predictions for the test set.
submission_columns = {'id': df_test.id, 'target': clf.predict(xtest_ctv)}
output = pd.DataFrame(submission_columns)
output.to_csv('submissionLogRegCtv.csv', index=False)

In [None]:
# Fitting a simple multinomial Naive Bayes on the n-gram count features
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
# Label the score with the model that actually produced it.
print("Naive Bayes Score: ", clf.score(xvalid_ctv, yvalid))


In [None]:
# Save the Naive Bayes predictions for the test set as id/target rows.
nb_test_predictions = clf.predict(xtest_ctv)
output = pd.DataFrame({'id': df_test.id, 'target': nb_test_predictions})
output.to_csv('submissionNaiveBayes.csv', index=False)

In [None]:
# Reduce the sparse count features to 120 dense components with truncated SVD
# (the author's note: 120-200 components are good enough for an SVM model).
# The default solver is randomized, so the seed is fixed to keep the
# components, and every model built on them, reproducible between runs.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_ctv)
xtrain_svd = svd.transform(xtrain_ctv)
xvalid_svd = svd.transform(xvalid_ctv)
xtest_svd = svd.transform(xtest_ctv)

# Standardize the SVD output. The scaler is fitted on the training split only;
# the scaled arrays get new names so the unscaled ones remain available.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [None]:
# Support vector classifier on the scaled SVD features.
# probability=True is required for the predict_proba call below.
clf = SVC(C=1.0, probability=True).fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

svm_score = clf.score(xvalid_svd_scl, yvalid)
print("SVM score: ", svm_score)


In [None]:
# Save the SVM predictions for the test set as id/target rows.
svm_test_predictions = clf.predict(xtest_svd_scl)
output = pd.DataFrame({'id': df_test.id, 'target': svm_test_predictions})
output.to_csv('submissionSVM.csv', index=False)

In [None]:

# Random forest on the n-gram count features. The seed is fixed because the
# forest is built from random bootstrap samples and feature subsets; without
# it the validation score and the submission change on every run.
RF_clf = RandomForestClassifier(random_state=42).fit(xtrain_ctv, ytrain)

print("RandomForest score: ", RF_clf.score(xvalid_ctv, yvalid))


In [None]:
# Save the random-forest predictions for the test set as id/target rows.
rf_test_predictions = RF_clf.predict(xtest_ctv)
output = pd.DataFrame({'id': df_test.id, 'target': rf_test_predictions})
output.to_csv('submissionRandForest.csv', index=False)