In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import nltk
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import feature_extraction, model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,f1_score

import re
import string

In [3]:
pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.6.3-py3-none-any.whl (2.7 MB)
     |████████████████████████████████| 2.7 MB 4.3 MB/s            
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
from spellchecker import SpellChecker

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
train_data=pd.read_csv('../input/nlp-getting-started/train.csv')
test_data=pd.read_csv('../input/nlp-getting-started/test.csv')

In [8]:
train_data.shape

(7613, 5)

In [9]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#Text Preprocessing

In [10]:
#Upper case to Lower case
for df in [train_data,test_data]:
    df['text']=df['text'].str.lower()

In [11]:
#URL clean-up
def remove_urls(text):
    url=re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

for df in [train_data,test_data]:
    df['text']=df['text'].apply(remove_urls)


In [12]:
#HTML clean-up
def remove_html(text):
    html=re.compile('<.*?>')
    return html.sub(r'', text)

for df in [train_data,test_data]:
    df['text']=df['text'].apply(remove_html)

In [13]:
#Punctuation removal
def remove_punc(row):
    punc=string.punctuation
    return ''.join([char for char in row if char not in punc])

train_data['no_punc']=train_data['text'].apply(remove_punc)
test_data['no_punc']=test_data['text'].apply(remove_punc)

In [14]:
#Stopwords removal
def remove_stopwords(row):
    stop_words=stopwords.words('english')
    return ' '.join([word for word in row.split() if word not in stop_words])

for df in [train_data,test_data]:
    df['no-stopwords']=df['no_punc'].apply(remove_stopwords)

In [15]:
#Digits removal
def remove_digits():
    for df in [train_data,test_data]:
        df['remove_digits']=df['no-stopwords'].apply(lambda x:re.sub(r'[0-9]+', '', x))
        
remove_digits()

In [16]:
#Stemming
stemmer = PorterStemmer()
def stemming(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
for df in [train_data,test_data]:
    df["stemmed"] = df["remove_digits"].apply(stemming)

In [17]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
for df in [train_data,test_data]:
    df["lemmatized"] = df["stemmed"].apply(lemmatization)

In [18]:
#Spelling correction
spellchecker=SpellChecker()
def spell_correction(row):
    
    incorrect=spellchecker.unknown(row.split())
    wordlist=[]
    for word in row.split():
        if word in incorrect:
            correct_word=spellchecker.correction(word)
            wordlist.append(correct_word)
        else:
            wordlist.append(word)
    return ' '.join(wordlist)
    
for df in [train_data,test_data]:
    df['spell_check']=df['lemmatized'].apply(spell_correction)

In [19]:
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'no_punc',
       'no-stopwords', 'remove_digits', 'stemmed', 'lemmatized',
       'spell_check'],
      dtype='object')

In [20]:
X_data=train_data.drop(['id','text','target','no_punc','no-stopwords','remove_digits','stemmed','lemmatized'],axis=1)
y_data=train_data['target']

In [21]:
#Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3)

print(len(X_train), len(X_test), len(X_train)+len(X_test))

5329 2284 7613


In [22]:
#Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
train_vectors = cv.fit_transform(X_train["spell_check"])
test_vectors = cv.transform(X_test["spell_check"])
test_data_vectors=cv.transform(test_data['spell_check']) 

In [23]:
#Model Selection
pred=[]
for model in [LogisticRegression,SVC,XGBClassifier,RandomForestClassifier,MultinomialNB]:
    classifier=model().fit(train_vectors,y_train)
    prediction=classifier.predict(test_vectors)
    p=f1_score(y_test,prediction)
    pred.append(p)
pd.DataFrame({'Model':['LogisticRegression','SVC','XGBClassifier','RandomForestClassifier','MultinomialNB'],'F1-score':pred})





Unnamed: 0,Model,F1-score
0,LogisticRegression,0.750834
1,SVC,0.745422
2,XGBClassifier,0.729746
3,RandomForestClassifier,0.738728
4,MultinomialNB,0.7557


In [24]:
model=SVC()
model.fit(train_vectors,y_train)

SVC()

In [25]:
predictions = model.predict(test_vectors)

In [26]:
#Evaluation

from sklearn.metrics import classification_report, confusion_matrix, f1_score
print(f1_score(y_test,predictions))

0.7454223272297696


In [27]:
print(confusion_matrix(y_test,predictions))

[[1222  111]
 [ 320  631]]


In [28]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85      1333
           1       0.85      0.66      0.75       951

    accuracy                           0.81      2284
   macro avg       0.82      0.79      0.80      2284
weighted avg       0.82      0.81      0.81      2284



In [29]:
#Submission
pred_sub = model.predict(test_data_vectors)
Result=pd.DataFrame(data={'id':test_data['id'],'target':pred_sub})
Result.to_csv('Submission.csv',index=False)