In [16]:
import pandas as pd
import re
import string
import numpy as np
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import spacy
from sklearn.naive_bayes import MultinomialNB
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
import en_core_web_sm
nlp = en_core_web_sm.load()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

# **Importing the Dataset**

In [17]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [18]:
df_train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [19]:
df_train.shape

(7613, 5)

# Preprocessing the Textual Data

* First we convert all the words in lowercase.
* Then we remove the punctutation,special characters,extra whitespaces and URLs using the regex expressions

**You can find all the resources of regex on the following website [Regex Expression](https://regexr.com/)**

In [20]:
def text_cleaner(text):
    text = text.lower() # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text) # remove extra whitespace
    text = re.sub(r'https?://\S+', '', text) # remove URLs
    text = re.sub(r"#", "", text)
    return text

In [21]:
df_train['new'] = df_train['text'].apply(text_cleaner)

In [22]:
df_train['new_text']=df_train['new'].apply(lambda x:nlp(x))
df_train.head()
df_train['new_text']

0       (our, deeds, are, the, reason, of, this, earth...
1           (forest, fire, near, la, ronge, sask, canada)
2       (all, residents, asked, to, shelter, in, place...
3       (13000, people, receive, wildfires, evacuation...
4       (just, got, sent, this, photo, from, ruby, ala...
                              ...                        
7608    (two, giant, cranes, holding, a, bridge, colla...
7609    (ariaahrary, thetawniest, the, out, of, contro...
7610    (m194, 0104, utc5, km, s, of, volcano, hawaii,...
7611    (police, investigating, after, an, ebike, coll...
7612    (the, latest, more, homes, razed, by, northern...
Name: new_text, Length: 7613, dtype: object

In [23]:
def transform(text):
    
    doc=nlp(text,disable=['parser','ner'])
    
    lemmas=[token.lemma_ for token in doc]
    
    a_lemmas=[lemma for lemma in lemmas if lemma.isalpha()]
    
    return ' '.join(a_lemmas)

In [24]:
df_train['new_text']=df_train['new_text'].apply(transform)
df_train['new_text']

0       our deed be the reason of this earthquake may ...
1                   forest fire near la ronge sask canada
2       all resident ask to shelter in place be be not...
3       people receive wildfire evacuation order in ca...
4       just got send this photo from ruby alaska as s...
                              ...                        
7608    two giant crane hold a bridge collapse into ne...
7609    ariaahrary thetawniest the out of control wild...
7610                               km s of volcano hawaii
7611    police investigate after an ebike collide with...
7612    the late more home raze by northern california...
Name: new_text, Length: 7613, dtype: object

In [25]:
X=df_train['new_text']
Y=df_train['target']
del df_train['new']
df_train.shape

(7613, 6)

In [26]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=4,test_size=0.2,stratify=Y)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(stop_words='english')
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

# Model Testing

In [28]:
models = []
models.append(('Logistic Regression', LogisticRegression()))
# models.append(('Kernel SVM', SVC()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('Multinomial NB', MultinomialNB()))
# models.append(('Random Forest', RandomForestClassifier()))
# models.append(('Decision Tree Classifier', DecisionTreeClassifier()))
models

[('Logistic Regression', LogisticRegression())]

In [29]:
X_train.shape

(6090, 12676)

# KFold Cross Validation 

The k-fold cross-validation method is widely used for calculating how well a machine learning model performs on a validation dataset.Once a k-value is determined, we can use it to assess various models on the dataset. We may then contrast the pattern of the scores to the scores of an analysis of the same model under the ideal test scenario to see whether or not they are strongly correlated.

1. Randomly shuffle the complete dataset.
2. The algorithm then divides the dataset into k groups, i.e., k folds of data
3. For every distinct group:
4. Use the dataset as a holdout dataset to validate the model.
5. The rest of the groups' datasets are used to train the model.
6. Fit a model onto the training dataset, then assess it on the holdout or validation dataset.
7. Keep the evaluation result but throw away the model generated.
8. Using the results of the model evaluation scores, summarise the model's performance.

In [30]:
from sklearn import model_selection
acc_score=[]
auc_score=[]
pre_results=[]
f1_results=[]
names=[]
result_col=pd.DataFrame(columns=['Algorithm','accuracy','ROC','Precision','f1_score'])
result_col

i=0
for name,model in models:
    kfold=model_selection.KFold(n_splits=10)
    names.append(name)
    cv_acc_results = model_selection.cross_val_score(model, X_train, Y_train,cv=kfold,scoring="accuracy")
    cv_auc_results = model_selection.cross_val_score(model, X_train, Y_train,cv=kfold,scoring="roc_auc")
    cv_pre_results = model_selection.cross_val_score(model, X_train, Y_train,cv=kfold,scoring="precision")
    cv_f1_results = model_selection.cross_val_score(model, X_train, Y_train,cv=kfold,scoring="f1")
    acc_score.append(cv_acc_results)
    auc_score.append(auc_score)
    pre_results.append(cv_pre_results)
    f1_results.append(cv_f1_results)
    
    result_col.loc[i]=[name,round(cv_acc_results.mean()*100,2),
                           round(cv_auc_results.mean()*100,2),
                           round(cv_pre_results.mean()*100,2),
                           round(cv_f1_results.mean()*100,2)]
    i+=1
    
result_col.sort_values(by=['ROC'],ascending=False)

Unnamed: 0,Algorithm,accuracy,ROC,Precision,f1_score
0,Logistic Regression,79.82,86.05,85.56,73.05


# Hyperparameter Tuning

In [31]:
models_tuned = []

models_tuned.append(('Logistic Regression', LogisticRegression(solver='liblinear',C = 10, max_iter=500, random_state = 0)))
# models_tuned.append(('SVC', SVC(C=1, kernel = 'linear', gamma=1, random_state = 0)))
# models_tuned.append(('Kernel SVM', SVC(C=1, kernel = 'rbf', gamma=1, random_state = 0)))
# models_tuned.append(('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean', p = 2,algorithm='brute')))
# models_tuned.append(('Multinomial NB', good_modelnb))
# models_tuned.append(('Decision Tree Classifier', DecisionTreeClassifier(criterion = 'gini', max_leaf_nodes=19, random_state = 0)))
# models_tuned.append(('Random Forest', RandomForestClassifier(max_leaf_nodes = 24,max_features = 25, 
#                                                            n_estimators = 50,criterion = 'entropy', random_state = 0)))

**Kfold Cross validation on test data**

In [32]:
acc_results_opt =[]
auc_results_opt =[]
pre_results_opt =[]
f1_results_opt =[]
names_opt = []

result_col_opt = ["Algorithm", "ROC AUC", "Accuracy", 'Precision', 'F1 Scores']
model_results_opt = pd.DataFrame(columns = result_col_opt)

i=0
# K- fold cross validation

for name, model in models_tuned:
    names_opt.append(name)
    kfold = model_selection.KFold(n_splits=10)
    
    cv_acc_results_opt = model_selection.cross_val_score(model, X_test, Y_test, 
                    cv = kfold, scoring="accuracy")
    cv_auc_results_opt = model_selection.cross_val_score(model, X_test, Y_test,
                    cv = kfold, scoring="roc_auc")
    cv_pre_results_opt = model_selection.cross_val_score(model, X_test, Y_test,
                    cv = kfold, scoring="precision")
    cv_f1_results_opt = model_selection.cross_val_score(model, X_test, Y_test,
                    cv = kfold, scoring="f1")
    acc_results_opt.append(cv_acc_results_opt)
    auc_results_opt.append(cv_auc_results_opt)
    pre_results_opt.append(cv_pre_results_opt)
    f1_results_opt.append(cv_f1_results_opt)
    model_results_opt.loc[i] = [name, 
                           round(cv_auc_results_opt.mean()*100,2),
                           round(cv_acc_results_opt.mean()*100,2),
                           round(cv_pre_results_opt.mean()*100,2),
                           round(cv_f1_results_opt.mean()*100,2)]
    i+=1
model_results_opt.sort_values(by = ['ROC AUC'], ascending=False)

Unnamed: 0,Algorithm,ROC AUC,Accuracy,Precision,F1 Scores
0,Logistic Regression,79.52,74.71,75.27,67.75


**Preprocessing the test data**

In [38]:
import re

def preprocess(text):
    # apply Spacy pipeline to the text
    doc = nlp(text)
    # lemmatize and remove stopwords from the text
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    # join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df_test['clean_text']=df_test['text'].apply(text_cleaner)
# df_test['clean_text'] = df_test['clean_text'].apply(lambda x: nlp(x))
df_test['final_text']=df_test['clean_text'].apply(preprocess)

In [39]:
eval=vectorizer.transform(df_test['final_text']).toarray()

In [40]:
C = 1 # algorithm name
kernel = 'rbf'# kernel type
gamma =  1 # kernel parameter
# Train and test model
good_modelsvm = svm.SVC(C=C, kernel=kernel, 
                                  gamma=gamma) # create model 
print(good_modelsvm) # display model parameters
good_modelsvm.fit(X_train,Y_train) # train model


SVC(C=1, gamma=1)


In [41]:
final_pred = good_modelsvm.predict(eval)
final_pred.shape

(3263,)

In [42]:
submission2 = df_test[['id']].reset_index(drop=True)
submission2['target'] = final_pred.astype('int64')

In [43]:
submission2.to_csv('submission.csv', index=False)