# 1: EDA and Data Processing

In [1]:
import os

%matplotlib inline
import string
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import altair as alt

# data
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer

# Feature selection
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer

# classifiers / models
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV

# other
from sklearn.metrics import accuracy_score, log_loss, make_scorer, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from sklearn.svm import SVC, SVR

### Read in the data

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
train_df = pd.read_csv("data/train.csv")

In [3]:
# Display the raw frame to get a first look at the columns and some example rows.
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
# Class proportions of the target: roughly 57% (0) vs 43% (1), so there is no severe class imbalance.
train_df["target"].value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [5]:
# Sanity check: the raw training set should have 7613 rows (and 5 columns).
train_df.shape

(7613, 5)

In [6]:
# Non-null counts below 7613 reveal missing values: a few in keyword, many in location.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
# Initial data exploration shows that we have different types of features (categorical, numerical, and binary)
# and that some of these features have missing values. The dataset is large and representative, and our target
# column is already labelled.

train_df.describe(include = 'all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [8]:
# Remove every row that has a NULL in any column.
# NOTE(review): dropna() without `subset` also drops rows that are only missing `location`
# (7613 -> 5080 rows), even though `location` is dropped later on. If the intent is to
# remove only rows with a missing keyword, consider dropna(subset=["keyword"]) -- confirm.

train_df = train_df.dropna()

In [9]:
# Check how many rows remain after dropping rows with missing values.
train_df.shape

(5080, 5)

In [10]:
# Lift Altair's default row limit so the whole training frame can be charted.
alt.data_transformers.disable_max_rows()

# Heatmap of tweet counts for every (keyword, target) combination.
keyword_plot = (
    alt.Chart(train_df)
    .mark_rect()
    .encode(
        x=alt.X("keyword", title="Keyword"),
        y=alt.Y("target", type="ordinal", title="Target"),
        color="count()",
    )
    .properties(width=5000, height=120)
)

keyword_plot

### Data processing

In [11]:
# Drop the columns that are not used as model inputs below; the shape is shown as the cell output.
train_df = train_df.drop(columns = ["id", "location"])
train_df.shape

(5080, 3)

In [12]:
# Display the processed frame (keyword, text, target) that modelling starts from.
train_df

Unnamed: 0,keyword,text,target
31,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,ablaze,We always try to bring the heavy. #metal #RT h...,0
33,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,ablaze,Crying out for more! Set me ablaze,0
35,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...
7575,wrecked,On the bright side I wrecked http://t.co/uEa0t...,0
7577,wrecked,@widda16 ... He's gone. You can relax. I thoug...,0
7579,wrecked,Three days off from work and they've pretty mu...,0
7580,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...,0


### Split the data into X and y

In [13]:
text_feature = "text"
target = "target"

# Features are every column except the target; the target column becomes y.
X_train = train_df.drop(columns=["target"])
y_train = train_df[target]

### Label the features

In [14]:
# Column names grouped by role. `text_feature` and `target` repeat the values assigned in the
# previous cell.
text_feature = "text"
target = "target"
# NOTE(review): despite the name, the preprocessor below one-hot encodes "keyword"
# rather than ordinal-encoding it.
ordinal_feature = "keyword"

### Create transformers and preprocessor

In [15]:
# Column-wise preprocessing shared by every model pipeline below.
# "text" is given as a bare string so CountVectorizer receives a 1-D column of documents,
# while ["keyword"] is given as a list so OneHotEncoder receives a 2-D, single-column input.
preprocessor = make_column_transformer(
    (CountVectorizer(max_features = 20_000), "text"), # bag of words for text feature, vocabulary capped at 20,000 terms
    (OneHotEncoder(handle_unknown = "ignore"), ["keyword"]), # keywords unseen during fit are encoded as all zeros
)

In [16]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters:
    ------
    model:
    the estimator or pipeline to cross-validate

    X_train:
    the training features

    y_train:
    the training target

    **kwargs:
    extra keyword arguments forwarded unchanged to cross_validate

    Returns:
    -------
    formatted scores: (pandas.Series)
    one "mean (+/- std)" string per entry returned by cross_validate,
    indexed by the entry name (e.g. fit_time, test_score)
    """
    # Build the per-fold score table once and reuse it for both statistics.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by iteration instead of integer-indexing the Series:
    # the Series are labelled by strings, and positional lookup with [] on a
    # non-integer index is deprecated in recent pandas versions.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [17]:
# Collects one column of formatted cross-validation scores per model, keyed by model name.
results = {}

In [18]:
# Baseline model: predicts labels at random following the training class distribution.
# The 'stratified' strategy is stochastic, so the seed is fixed (same value as the other
# models in this notebook) to make the baseline scores reproducible across runs.
dummy_pipe = make_pipeline(preprocessor, DummyClassifier(strategy='stratified', random_state=2))
results['Dummy'] = mean_std_cross_val_scores(dummy_pipe, X_train, y_train, return_train_score = True)

pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.123 (+/- 0.025)
score_time,0.024 (+/- 0.004)
test_score,0.513 (+/- 0.017)
train_score,0.506 (+/- 0.013)


In [19]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

# Every candidate model shares the same preprocessing step and the same seed (2).
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=2000, random_state=2),
)
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=2),
)
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(random_state=2),
)
pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=2),
)
pipe_catboost = make_pipeline(
    preprocessor,
    CatBoostClassifier(verbose=0, random_state=2),
)

# Display name -> pipeline, in the order the models are evaluated and reported.
classifiers = dict(
    [
        ("logistic regression", pipe_lr),
        ("random forest", pipe_rf),
        ("XGBoost", pipe_xgb),
        ("LightGBM", pipe_lgbm),
        ("CatBoost", pipe_catboost),
    ]
)

In [20]:
# Cross-validate every candidate pipeline and store its formatted mean/std scores
# alongside the baseline already present in `results`.
for classifier_name, classifier in classifiers.items():
    results[classifier_name] = mean_std_cross_val_scores(classifier, X_train, y_train, return_train_score=True)

# NOTE(review): this rebinds `results` from a dict to a DataFrame, so the name
# refers to a different type once this cell has run.
results = pd.DataFrame(results)
results

Unnamed: 0,Dummy,logistic regression,random forest,XGBoost,LightGBM,CatBoost
fit_time,0.123 (+/- 0.025),0.366 (+/- 0.040),2.680 (+/- 0.117),1.156 (+/- 0.141),0.438 (+/- 0.084),29.545 (+/- 0.873)
score_time,0.024 (+/- 0.004),0.024 (+/- 0.004),0.105 (+/- 0.011),0.037 (+/- 0.005),0.032 (+/- 0.006),0.050 (+/- 0.004)
test_score,0.513 (+/- 0.017),0.698 (+/- 0.045),0.687 (+/- 0.036),0.682 (+/- 0.041),0.704 (+/- 0.029),0.685 (+/- 0.036)
train_score,0.506 (+/- 0.013),0.980 (+/- 0.001),0.998 (+/- 0.000),0.876 (+/- 0.006),0.864 (+/- 0.005),0.860 (+/- 0.005)


In [None]:
def has_link(text):
    """
    Returns true or false if a website link is present in the tweet.

    Every whitespace-separated token of the tweet is inspected, and both
    http:// and https:// links are recognised (case-insensitively).

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    link present or not: (boolean)
    True if any token starts with http:// or https://, False otherwise
    (including for an empty string)

    """

    # Check all tokens, not just the first one: returning False from inside
    # the loop would stop the scan after the first word of the tweet.
    return any(
        token.lower().startswith(("http://", "https://"))
        for token in text.split()  # split on any whitespace
    )

def has_hashtag(text):
    """
    Returns true or false if a hashtag is present in the tweet.

    Every whitespace-separated token of the tweet is inspected.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    hashtag present or not: (boolean)
    True if any token starts with '#', False otherwise (including for an
    empty string)
    """

    # Check all tokens, not just the first one: returning False from inside
    # the loop would stop the scan after the first word of the tweet.
    return any(
        token.startswith('#')
        for token in text.split()  # split on any whitespace
    )

# Add one boolean column per engineered text feature.
train_df = train_df.assign(
    link=train_df["text"].apply(has_link),
    hashtag=train_df["text"].apply(has_hashtag),
)