In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import re 
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

import category_encoders as ce 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve,auc 

import warnings
warnings.filterwarnings('ignore')


In [2]:
data_train=pd.read_csv('train.csv')
data_test=pd.read_csv('test.csv')
data_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
data_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
data_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
data_train.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [6]:
data_train.drop(columns=['keyword','location'],axis=1,inplace=True)
data_test.drop(columns=['keyword','location'],axis=1,inplace=True)

In [7]:
data_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
data_train.drop_duplicates('text',keep='first',inplace=True)

In [9]:
data_train.target.value_counts()

target
0    4305
1    3198
Name: count, dtype: int64

In [10]:
data_train['text'].shape

(7503,)

In [11]:
data_train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [12]:
data_train.drop(columns=['id'],axis=1,inplace=True)
data_test.drop(columns=['id'],axis=1,inplace=True)
data_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
data_test.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [14]:
#preprocess data 

def preprocess_text(text):
    #remove URLs
    text=re.sub(r'http\S+', '', text)
    text=re.sub(r'\[|\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\d', '', text)
    text = re.sub('[“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub('[^A-Za-z0-9\s]', '', text)
     
    #Tokenization
    tokens=word_tokenize(text)
    
    #Lowercasing 
    tokens_lower=[token.lower() for token in tokens]
    
    #Removing punctuation 
    tokens_no_punct=[token for token in tokens_lower if token not in string.punctuation ]
    
    #Removing stopwords
    stop_words=set(stopwords.words('english'))
    tokens_no_stopwords=[token for token in tokens_no_punct if token not in stop_words]
    
    #stemming 
    porter=PorterStemmer()
    tokens_stemmed=[porter.stem(token) for token in tokens_no_stopwords]
    
    #lemmatization
    lemmatizer=WordNetLemmatizer()
    tokens_lemmatized=[lemmatizer.lemmatize(token) for token in tokens_no_stopwords]
    
    return tokens_lemmatized


In [15]:
data_train['text']=data_train['text'].apply(lambda text: preprocess_text(text))
data_train['text']=data_train['text'].apply(lambda x:' '.join(x))

In [16]:
data_test['text']=data_test['text'].apply(lambda text: preprocess_text(text))
data_test['text']=data_test['text'].apply(lambda x:' '.join(x))

In [17]:
data_train.head()

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,people receive wildfire evacuation order calif...,1
4,got sent photo ruby alaska smoke wildfire pour...,1


In [18]:
data_test.head()


Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different city stay safe ever...
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill china taiwan


In [19]:
X_train=data_train['text'].values
y_train=data_train['target'].values

In [20]:
print("X_train shape:",X_train.shape)
print("y_train shape:",y_train.shape)
print("X_train datatype:",X_train.dtype)
print("y_train datatype:",y_train.dtype)

X_train shape: (7503,)
y_train shape: (7503,)
X_train datatype: object
y_train datatype: int64


In [21]:
X_test=data_test['text'].values
y_test=y_train
y_test=y_test[:len(X_test)]

In [22]:
print("X_test shape:",X_test.shape)
print("y_test shape:",y_test.shape)
print("X_test datatype:",X_test.dtype)
print("y_test datatype:",y_test.dtype)

X_test shape: (3263,)
y_test shape: (3263,)
X_test datatype: object
y_test datatype: int64


# LOGISTIC REGRESSION

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
pipe_lgr=Pipeline([('tfidf',TfidfVectorizer()),
                   ('lgr',LogisticRegression(solver='liblinear'))])
model_lgr=pipe_lgr.fit(X_train,y_train)


In [24]:
from sklearn.metrics import f1_score
y_pred_lgr=model_lgr.predict(X_test)
accuracy=model_lgr.score(X_test,y_test)
print("Accuray of the model :{}".format(accuracy))
f1=f1_score(y_test,y_pred_lgr)
print("f1 score: {}".format(f1))

Accuray of the model :0.5574624578608642
f1 score: 0.399833748960931


# Gradient Boost


In [25]:
from sklearn.ensemble import GradientBoostingClassifier
pipe_gb=Pipeline([('tfidf',TfidfVectorizer()),
                  ('gb',GradientBoostingClassifier(max_depth=100))])
model_gb=pipe_gb.fit(X_train,y_train)
y_pred_gb=model_gb.predict(X_test)
accuracy=model_gb.score(X_test,y_test)
print("Accuray of the model :{}".format(accuracy))
f1=f1_score(y_test,y_pred_gb)
print("f1 score: {}".format(f1))

Accuray of the model :0.5531719276739198
f1 score: 0.4106709781729992


In [27]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

model_xgb=XGBClassifier(max_depth=100)
model_rfc=RandomForestClassifier(max_depth=100)

pipe2 = Pipeline([('tfidf',TfidfVectorizer()),
                ('model',VotingClassifier(estimators=[('XGBoost',model_xgb),('rfc',model_rfc)],voting='hard'))])

model_2=pipe2.fit(X_train,y_train)
print(f1_score(y_test,model_2.predict(X_test)))

0.34557438794726936


In [28]:
from sklearn.tree import DecisionTreeClassifier

pipe_dt=Pipeline([('tfidfv',TfidfVectorizer()),
                  ('model',DecisionTreeClassifier(max_depth=75))])
model_3=pipe_dt.fit(X_train,y_train)
print(f1_score(y_test,model_3.predict(X_test)))

0.3857693956279469
