In [1]:
# importing the required modules

import pandas as pd
import numpy as np

In [2]:
#reading train and test data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# checking for missing values
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
# preprocess the text part of the data removing stopwords, tokenization and lemmatization

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# count unique values in each columns except text
for col in train.columns:
    if col != 'text':
        print(col, train[col].nunique())
        
for col in train.columns:
    if col != 'text':
        print(col, train[col].unique())
        

id 7613
keyword 221
location 3341
target 2
id [    1     4     5 ... 10871 10872 10873]
keyword [nan 'ablaze' 'accident' 'aftershock' 'airplane%20accident' 'ambulance'
 'annihilated' 'annihilation' 'apocalypse' 'armageddon' 'army' 'arson'
 'arsonist' 'attack' 'attacked' 'avalanche' 'battle' 'bioterror'
 'bioterrorism' 'blaze' 'blazing' 'bleeding' 'blew%20up' 'blight'
 'blizzard' 'blood' 'bloody' 'blown%20up' 'body%20bag' 'body%20bagging'
 'body%20bags' 'bomb' 'bombed' 'bombing' 'bridge%20collapse'
 'buildings%20burning' 'buildings%20on%20fire' 'burned' 'burning'
 'burning%20buildings' 'bush%20fires' 'casualties' 'casualty'
 'catastrophe' 'catastrophic' 'chemical%20emergency' 'cliff%20fall'
 'collapse' 'collapsed' 'collide' 'collided' 'collision' 'crash' 'crashed'
 'crush' 'crushed' 'curfew' 'cyclone' 'damage' 'danger' 'dead' 'death'
 'deaths' 'debris' 'deluge' 'deluged' 'demolish' 'demolished' 'demolition'
 'derail' 'derailed' 'derailment' 'desolate' 'desolation' 'destroy'
 'destroyed'

In [6]:
# i want to check the distribution of the target where location is null and keyword is alsoo null

train[train['keyword'].isnull()]['target'].value_counts()

1    42
0    19
Name: target, dtype: int64

In [7]:
# fill nan values in  location column and drop keyword where it is na

train['location'] = train['location'].fillna('unknown')
test['location'] = test['location'].fillna('unknown')
train['keyword'] = train['keyword'].fillna(train['keyword'].mode()[0])
test['keyword'] = test['keyword'].fillna(test['keyword'].mode()[0])	


In [8]:
# Train test split for validation
from sklearn.model_selection import train_test_split

X = train.drop('target', axis=1)
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
#combine all three columns to create a single text column
#X_train['text'] = X_train['keyword'] + ' ' + X_train['location'] + ' ' + X_train['text']
#X_val['text'] = X_val['keyword'] + ' ' + X_val['location'] + ' ' + X_val['text']
#test['text'] = test['keyword'] + ' ' + test['location'] + ' ' + test['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['keyword'] + ' ' + X_train['location'] + ' ' + X_train['text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val['text'] = X_val['keyword'] + ' ' + X_val['location'] + ' ' + X_val['text']


In [None]:
# vectorizing keyword and location columns
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_keyword = vectorizer.fit_transform(X_train['keyword'])
X_val_keyword = vectorizer.transform(X_val['keyword'])
test_keyword = vectorizer.transform(test['keyword'])

X_train_location = vectorizer.fit_transform(X_train['location'])
X_val_location = vectorizer.transform(X_val['location'])
test_location = vectorizer.transform(test['location'])


In [10]:
# Feature extraction using bag of words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(X_train['text'])
X_val_text = vectorizer.transform(X_val['text'])
X_test_text = vectorizer.transform(test['text'])

# Feature extraction using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_text = tfidf.fit_transform(X_train['text'])
X_val_text = tfidf.transform(X_val['text'])
X_test_text = tfidf.transform(test['text'])






In [11]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Assuming X_train_split, X_val, y_train_split, y_val are already defined
# Define the pipeline
pipeline = Pipeline([
	('clf', LogisticRegression())
])

# Define the parameter grid
param_grid = [
	{
		'clf': [LogisticRegression()],
		'clf__C': [0.1, 1, 10]
	},
	{
		'clf': [RandomForestClassifier()],
		'clf__n_estimators': [50, 100, 200],
		'clf__max_depth': [None, 10, 20]
	},
	{
		'clf': [MultinomialNB()],
		'clf__alpha': [0.01, 0.1, 1]
	},
	{
		'clf': [XGBClassifier()],
		'clf__n_estimators': [50, 100, 200],
		'clf__max_depth': [3, 6, 9],
		'clf__learning_rate': [0.01, 0.1, 0.2]
	},
	{
		'clf': [SVC()],
		'clf__C': [0.1, 1, 10],
		'clf__kernel': ['linear', 'rbf']
	}
]

# Perform Grid Search CV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_text, y_train)

# Best model and parameters
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Evaluate on the validation set
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val_text)
from sklearn.metrics import f1_score
val_f1_score = f1_score(y_val, val_predictions)
print("Validation F1 Score:", val_f1_score)

Best Model: Pipeline(steps=[('clf', MultinomialNB(alpha=0.1))])
Best Parameters: {'clf': MultinomialNB(alpha=0.1), 'clf__alpha': 0.1}
Best F1 Score: 0.7435283222858984
Validation F1 Score: 0.7463884430176565


In [12]:
#Training a model with cross validation and f1 score as metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

#Training the model
best_model.fit(X_train_text, y_train)
y_pred = best_model.predict(X_val_text)
f1_score(y_val, y_pred)



0.7463884430176565

In [13]:
#Predicting on test data
y_test = best_model.predict(X_test_text)

#Saving the predictions in submission.csv with same format as sample sumission
submission = pd.read_csv('data/sample_submission.csv')

submission['target'] = y_test
submission.to_csv('submission.csv', index=False)

