In [1]:
# imports libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import re 
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

import category_encoders as ce 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve,auc 

import warnings
warnings.filterwarnings('ignore')


In [2]:
df_train=pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df_test=pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
print(df_train.shape)
print(df_test.shape)

(7613, 5)
(3263, 4)


In [5]:
df_train.drop(columns=['keyword','location'],inplace=True)
df_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
df_train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [7]:
# preprocess data 
def preprocess_text(text):
    #remove URLs
    text=re.sub(r'http\S+', '', text)
    
    #Tokenization
    tokens=word_tokenize(text)
    
    #Lowercasing 
    tokens_lower=[token.lower() for token in tokens]
    
    #Removing punctuation 
    tokens_no_punct=[token for token in tokens_lower if token not in string.punctuation ]
    
    #Removing stopwords
    stop_words=set(stopwords.words('english'))
    tokens_no_stopwords=[token for token in tokens_no_punct if token not in stop_words]
    
    #stemming 
    porter=PorterStemmer()
    tokens_stemmed=[porter.stem(token) for token in tokens_no_stopwords]
    
    #lemmatization
    lemmatizer=WordNetLemmatizer()
    tokens_lemmatized=[lemmatizer.lemmatize(token) for token in tokens_no_stopwords]
    
    return tokens_lemmatized

    




In [8]:
df_train['text']=df_train['text'].apply(lambda text: preprocess_text(text))

In [9]:
df_train.head()

Unnamed: 0,id,text,target
0,1,"[deed, reason, earthquake, may, allah, forgive...",1
1,4,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,"[resident, asked, 'shelter, place, notified, o...",1
3,6,"[13,000, people, receive, wildfire, evacuation...",1
4,7,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


In [10]:
df_train['text']=df_train['text'].apply(lambda x:' '.join(x))


In [11]:
df_train.head()

Unnamed: 0,id,text,target
0,1,deed reason earthquake may allah forgive u,1
1,4,forest fire near la ronge sask canada,1
2,5,resident asked 'shelter place notified officer...,1
3,6,"13,000 people receive wildfire evacuation orde...",1
4,7,got sent photo ruby alaska smoke wildfire pour...,1


In [12]:
training_sentences=[]
train_sentences=df_train['text'].values
train_labels=df_train['target'].values

for i in range(train_sentences.shape[0]):
    x=str(train_sentences[i])
    training_sentences.append(x)
training_sentences=np.array(training_sentences)


In [13]:
print(training_sentences)

['deed reason earthquake may allah forgive u'
 'forest fire near la ronge sask canada'
 "resident asked 'shelter place notified officer evacuation shelter place order expected"
 ... 'm1.94 01:04 utc 5km volcano hawaii'
 'police investigating e-bike collided car little portugal e-bike rider suffered serious non-life threatening injury'
 'latest home razed northern california wildfire abc news']


In [14]:
import tensorflow as tf
train_labels=tf.keras.utils.to_categorical(train_labels)

In [15]:
print(train_sentences.shape)
print(train_labels.shape)

(7613,)
(7613, 2)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
train_sentences=[sentence for sentence in train_sentences if isinstance(sentence, str)]
count_vect=CountVectorizer()
X_train_count=count_vect.fit_transform(train_sentences)
print(X_train_count.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
from sklearn.model_selection import StratifiedKFold
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [18]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [27]:
np.random.seed(1)


Pipeline_svm=make_pipeline(count_vect,SVC(probability=True,kernel='linear',class_weight='balanced'))

grid_svm=GridSearchCV(Pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1]}, 
                    cv = kfolds,
                    scoring="roc_auc",
                    verbose=1,   
                    n_jobs=-1)

In [29]:
train_labels_new=df_train['target'].values
grid_svm.fit(training_sentences,train_labels_new)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [32]:
df_test['text']=df_test['text'].apply(lambda text: preprocess_text(text))
df_test['text']=df_test['text'].apply(lambda x:' '.join(x))



test_sentences=df_test['text'].values
test_labels=df_train['target'].values
test_labels=test_labels[:len(test_sentences)]


testing_sentences=[sentence for sentence in test_sentences if isinstance(sentence, str)]
count_vect=CountVectorizer()
testing_sentences=count_vect.fit_transform(testing_sentences)
#print(X_train_count.toarray())


testing_sentences=[]
    

for i in range(test_sentences.shape[0]):
    x=str(test_sentences[i])
    testing_sentences.append(x)

testing_sentences=np.array(testing_sentences)

In [33]:
"""test_sentences=df_test['text'].values
test_labels=train_labels_new[:len(test_sentences)]"""

"test_sentences=df_test['text'].values\ntest_labels=train_labels_new[:len(test_sentences)]"

In [34]:
"""print(test_sentences.shape)
print(test_labels.shape)"""

'print(test_sentences.shape)\nprint(test_labels.shape)'

In [35]:
grid_svm.score(testing_sentences,test_labels)

0.5376598482035695