In [131]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [34]:

# Read the pre-cleaned training CSV and discard every row that contains a
# missing value, in one chained expression (no in-place mutation).
full_train_df = pd.read_csv("data/cleaned_train.csv").dropna()

# Report the size of the cleaned frame, then preview the first rows.
print("Train Data shape :", full_train_df.shape)
full_train_df.head()

Train Data shape : (7610, 3)


Unnamed: 0,id,text_cleaned,target
0,1,deeds reason may allah forgive us,1
1,4,forest fire near la ronge sask canada,1
2,5,residents asked shelter place notified officer...,1
3,6,people receive evacuation orders california,1
4,7,got sent photo ruby smoke pours school,1


In [35]:
# Print per-column dtypes and non-null counts — a quick check of the frame
# after the dropna performed when it was loaded.
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7610 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7610 non-null   int64 
 1   text_cleaned  7610 non-null   object
 2   target        7610 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 237.8+ KB


### Splitting data

In [36]:
# Shuffle every row exactly once (frac=1, no replacement) with a fixed seed so
# the split below is reproducible, then carve the shuffled frame into three
# contiguous, non-overlapping slices.
shuffled_train_df = full_train_df.sample(frac=1,
                                         random_state=42,
                                         replace=False)

# Positional split boundaries: rows [0, TRAIN_END) are train,
# [TRAIN_END, VAL_END) are validation, and [VAL_END, end) are test.
TRAIN_END = 6000
VAL_END = 6800

# .iloc makes it explicit that the slices are positional, not label-based
# (the index is a shuffled, gappy integer index after dropna + sample).
train_data_split_df = shuffled_train_df.iloc[:TRAIN_END]
val_data_split_df = shuffled_train_df.iloc[TRAIN_END:VAL_END]
test_data_split_df = shuffled_train_df.iloc[VAL_END:]

# Keep the "id" column of each split so rows can be traced back later.
train_split_ids = train_data_split_df["id"]
val_split_ids = val_data_split_df["id"]
# Bug fix: this previously read from val_data_split_df, so the "test" ids were
# a copy of the validation ids.
test_split_ids = test_data_split_df["id"]

# The three slices must partition the shuffled frame exactly.
assert (
    len(train_data_split_df) + len(val_data_split_df) + len(test_data_split_df)
    == len(shuffled_train_df)
), "train/val/test splits do not cover the full data set"

print("Train  split shape: ", train_data_split_df.shape)
print("Validation split shape: ", val_data_split_df.shape)
print("Test  split shape: ", test_data_split_df.shape)

Train  split shape:  (6000, 3)
Validation split shape:  (800, 3)
Test  split shape:  (810, 3)


### Establish Baseline: TF-IDF + Logistic Regression

In [108]:
# Hyperparameters for the baseline TF-IDF vectorizer.
TF_MAX_FEATURES = 4000  # upper bound on vocabulary size (max_features)
TF_MAX_DF = 0.98        # max_df; per scikit-learn, a float is a proportion of documents
TF_MIN_DF = 0.0001      # min_df; likewise a proportion of documents when given as a float


In [109]:

# Configure (but do not yet fit) the TF-IDF vectorizer for the baseline model,
# using the TF_* hyperparameter constants.
tfidf_vectorizer = TfidfVectorizer(
    max_features=TF_MAX_FEATURES,
    max_df=TF_MAX_DF,
    min_df=TF_MIN_DF,
)


In [110]:
# Fit the vectorizer on the training split's text only; the validation and
# test splits are not passed to fit().
tfidf_vectorizer.fit(train_data_split_df["text_cleaned"])

In [111]:
# Vectorize each split's cleaned text with the already-fitted vectorizer and
# densify the result, in train / validation / test order.
train_tfidf_matrix, val_tfidf_matrix, test_tfidf_matrix = (
    tfidf_vectorizer.transform(split_df["text_cleaned"]).toarray()
    for split_df in (train_data_split_df, val_data_split_df, test_data_split_df)
)

In [126]:
# Baseline classifier: logistic regression with a fixed random_state,
# otherwise default settings.
# (LogisticRegression and f1_score are imported in the notebook's top import
# cell rather than mid-notebook, so a fresh-kernel run has all dependencies
# declared up front.)
logreg = LogisticRegression(random_state=42)

# Train on the dense TF-IDF features of the training split against its
# "target" column. fit() returns the estimator, so the cell displays it.
logreg.fit(train_tfidf_matrix, train_data_split_df["target"])

In [130]:

def _weighted_f1(split_df, tfidf_matrix):
    """Return the weighted-average F1 of ``logreg`` on one split.

    ``split_df["target"]`` supplies the true labels and ``tfidf_matrix`` the
    features that ``logreg`` predicts from.
    """
    return f1_score(split_df["target"], logreg.predict(tfidf_matrix), average='weighted')


# Score the fitted baseline on each split.
train_f1 = _weighted_f1(train_data_split_df, train_tfidf_matrix)
val_f1 = _weighted_f1(val_data_split_df, val_tfidf_matrix)
test_f1 = _weighted_f1(test_data_split_df, test_tfidf_matrix)

print("Train F1 Score: ", train_f1)
print("Validation F1 Score: ", val_f1)
print("Test F1 Score: ", test_f1)


Train F1 Score:  0.8674153284658939
Validation F1 Score:  0.7977974666054783
Test F1 Score:  0.7896349694454248


### Saving baseline pipeline

In [136]:
# Bundle the already-fitted vectorizer and classifier into a single Pipeline
# object: text goes in, the vectorizer's output feeds the classifier.
pipeline = Pipeline(
    steps=[
        ("TF-IDF", tfidf_vectorizer),
        ("Logistic Regression", logreg),
    ]
)