In [49]:
# importing the required modules

import pandas as pd
import numpy as np

In [50]:
#reading train and test data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [51]:
# checking for missing values
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [52]:
# preprocess the text part of the data removing stopwords, tokenization and lemmatization

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
# count unique values in each columns except text
for col in train.columns:
    if col != 'text':
        print(col, train[col].nunique())
        
for col in train.columns:
    if col != 'text':
        print(col, train[col].unique())
        

id 7613
keyword 221
location 3341
target 2
id [    1     4     5 ... 10871 10872 10873]
keyword [nan 'ablaze' 'accident' 'aftershock' 'airplane%20accident' 'ambulance'
 'annihilated' 'annihilation' 'apocalypse' 'armageddon' 'army' 'arson'
 'arsonist' 'attack' 'attacked' 'avalanche' 'battle' 'bioterror'
 'bioterrorism' 'blaze' 'blazing' 'bleeding' 'blew%20up' 'blight'
 'blizzard' 'blood' 'bloody' 'blown%20up' 'body%20bag' 'body%20bagging'
 'body%20bags' 'bomb' 'bombed' 'bombing' 'bridge%20collapse'
 'buildings%20burning' 'buildings%20on%20fire' 'burned' 'burning'
 'burning%20buildings' 'bush%20fires' 'casualties' 'casualty'
 'catastrophe' 'catastrophic' 'chemical%20emergency' 'cliff%20fall'
 'collapse' 'collapsed' 'collide' 'collided' 'collision' 'crash' 'crashed'
 'crush' 'crushed' 'curfew' 'cyclone' 'damage' 'danger' 'dead' 'death'
 'deaths' 'debris' 'deluge' 'deluged' 'demolish' 'demolished' 'demolition'
 'derail' 'derailed' 'derailment' 'desolate' 'desolation' 'destroy'
 'destroyed'

In [54]:
# i want to check the distribution of the target where location is null and keyword is alsoo null

train[train['keyword'].isnull()]['target'].value_counts()

1    42
0    19
Name: target, dtype: int64

In [55]:
# fill nan values in  location column and drop keyword where it is na

train['location'] = train['location'].fillna('unknown')
test['location'] = test['location'].fillna('unknown')
train['keyword'] = train['keyword'].fillna('none')
test['keyword'] = test['keyword'].fillna('none')


In [56]:
# Train test split for validation
from sklearn.model_selection import train_test_split

X = train.drop('target', axis=1)
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [58]:
#combine all three columns to create a single text column
X_train['text'] = X_train['keyword'] + ' ' + X_train['location'] + ' ' + X_train['text']
X_val['text'] = X_val['keyword'] + ' ' + X_val['location'] + ' ' + X_val['text']
test['text'] = test['keyword'] + ' ' + test['location'] + ' ' + test['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['keyword'] + ' ' + X_train['location'] + ' ' + X_train['text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val['text'] = X_val['keyword'] + ' ' + X_val['location'] + ' ' + X_val['text']


In [59]:
# Feature extraction using bag of words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(X_train['text'])
X_val_text = vectorizer.transform(X_val['text'])
X_test_text = vectorizer.transform(test['text'])




In [61]:
#Training a model with cross validation and f1 score as metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X_train_text, y_train, cv=5, scoring='f1')
scores.mean()

#Training the model
clf.fit(X_train_text, y_train)
y_pred = clf.predict(X_val_text)
f1_score(y_val, y_pred)



0.7116237799467614