# Imports

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from nltk.stem import PorterStemmer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc, precision_recall_curve, f1_score

In [None]:
# Load the competition train/test CSV files and set the id columns aside,
# since 'id' is dropped from both frames further down.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_ids = train['id']

test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_ids = test['id']

# Data Cleaning

## Fill missing 'keyword' and 'location' values

In [None]:
# Number of missing entries in each column of the training frame.
print(train.isna().sum())

In [None]:
# Replace missing keywords/locations with empty strings in both frames;
# every other column is left untouched.
train = train.fillna({'keyword': '', 'location': ''})
test = test.fillna({'keyword': '', 'location': ''})

## Remove user handles

In [None]:
def remove_pattern(input_text, pattern):
    """Return ``input_text`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.

    Notes
    -----
    The pattern is substituted directly. Re-using each *matched string* as a
    new regular expression is wrong in two ways: a short match such as
    ``@a`` would also eat the prefix of a longer one (``@ab`` -> ``b``), and
    regex metacharacters inside a match would be interpreted instead of
    being taken literally.
    """
    return re.sub(pattern, '', input_text)

In [None]:
# Remove user handles (@user) from the raw tweet text.
# The pattern is a raw string so that "\w" reaches the regex engine as
# written instead of relying on Python tolerating an invalid string escape.
# Series.apply keeps the result a pandas column and also copes with an empty
# frame, which np.vectorize does not.
train['text_cleaned'] = train['text'].apply(lambda tweet: remove_pattern(tweet, r"@[\w]*"))
test['text_cleaned'] = test['text'].apply(lambda tweet: remove_pattern(tweet, r"@[\w]*"))

## Remove special characters

In [None]:
def remove_special_char(cols):
    """Replace every character that is not a letter or ``#`` with a space.

    Parameters
    ----------
    cols : pandas.Series
        A column of strings (this is applied column by column through
        ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        The column with each disallowed character replaced by one space.

    Notes
    -----
    ``regex=True`` is passed explicitly: since pandas 2.0 the default of
    ``Series.str.replace`` is a literal replacement, which would leave the
    text unchanged for this character-class pattern.
    """
    return cols.str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [None]:
# Replace non-letter characters (except '#') in the three text columns.
# The test frame gets the same treatment as the training frame: the
# vectorizers are fitted on train and applied to test, so both must go
# through identical cleaning.
train[['text_cleaned', 'keyword', 'location']] = train[['text_cleaned', 'keyword', 'location']].apply(remove_special_char)
test[['text_cleaned', 'keyword', 'location']] = test[['text_cleaned', 'keyword', 'location']].apply(remove_special_char)

## Remove stop words

In [None]:
def _drop_stop_words(text):
    """Return ``text`` without English stop words, ignoring letter case.

    The stop-word list is all lowercase, so each token is lowercased for the
    membership test only; the tokens that are kept retain their original
    casing. Tokens are whitespace-separated and re-joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS)


train['text_cleaned'] = train['text_cleaned'].apply(_drop_stop_words)
test['text_cleaned'] = test['text_cleaned'].apply(_drop_stop_words)

## Stemming

In [None]:
# Split each cleaned tweet on whitespace and reduce every token to its
# Porter stem; the per-tweet stem lists are stored in 'text_stems'.
stemmer = PorterStemmer()

tokenized_tweets = train['text_cleaned'].apply(str.split)
train['text_stems'] = tokenized_tweets.apply(lambda tokens: list(map(stemmer.stem, tokens)))

tokenized_tweets = test['text_cleaned'].apply(str.split)
test['text_stems'] = tokenized_tweets.apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [None]:
# Glue each list of stems back into one space-separated string.
train['text_stemmed'] = train['text_stems'].map(' '.join)
test['text_stemmed'] = test['text_stems'].map(' '.join)

# Feature Engineering

## Add column for tweet length

In [None]:
# Character count of the original, uncleaned tweet.
train['length_raw'] = train['text'].map(len)
test['length_raw'] = test['text'].map(len)

## Add indicator column for user handles

In [None]:
def contains_handle(col):
    """Return 1 if the text contains an ``@`` handle marker, otherwise 0.

    Parameters
    ----------
    col : str
        A single tweet text.

    Returns
    -------
    int
        ``1`` when the pattern ``@[\\w]*`` is found, ``0`` otherwise. Because
        the word-character part may be empty, a lone ``@`` also counts.

    Notes
    -----
    The pattern is written as a raw string so the ``\\w`` escape is passed to
    the regex engine verbatim rather than depending on Python accepting an
    invalid string escape sequence.
    """
    return 1 if re.search(r"@[\w]*", col) else 0

In [None]:
# 0/1 flag per tweet: does the raw text contain a user handle marker?
train['contains_handle'] = train['text'].map(contains_handle)
test['contains_handle'] = test['text'].map(contains_handle)

## Add a length column for cleaned tweets

In [None]:
# Character count of the tweet after cleaning.
train['length_cleaned'] = train['text_cleaned'].map(len)
test['length_cleaned'] = test['text_cleaned'].map(len)

# Preparation

## Drop unnecessary columns

In [None]:
# Only the stemmed text, keyword, location and engineered features are
# kept; the raw/intermediate text columns and the id are removed.
train = train.drop(columns=['text', 'text_cleaned', 'text_stems', 'id'])
test = test.drop(columns=['text', 'text_cleaned', 'text_stems', 'id'])

# Feature Extraction

## Vectorization

In [None]:
# Separate the label from the features; the test frame has no label column
# to remove, so it is used as-is.
y_train = train['target'].to_numpy(copy=True)
X_train = train.drop(columns=['target'])
X_test = test

In [None]:
# Build bag-of-words count features for 'keyword', 'location' and
# 'text_stemmed'. Each vectorizer is fitted on the training column only and
# then applied to both frames; the new count columns are prepended.


def _feature_names(vectorizer):
    """Return the fitted vocabulary as a list, on old and new scikit-learn.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``, so the newer accessor is preferred and the
    old one is only used when the newer one does not exist.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


def _count_features(vectorizer, column, prefix):
    """Return the token counts of ``column`` as a dense DataFrame.

    The frame reuses ``column``'s index so the later ``pd.concat(axis=1)``
    lines rows up correctly, and every feature name gets ``prefix`` so the
    count columns cannot clash with the raw columns ('keyword', 'location',
    ...) or with the features produced by another vectorizer.
    """
    names = [prefix + name for name in _feature_names(vectorizer)]
    # toarray() gives a plain ndarray (todense() returns np.matrix).
    counts = vectorizer.transform(column).toarray()
    return pd.DataFrame(counts, columns=names, index=column.index)


# These start as aliases of the input frames; pd.concat below always builds
# new frames, so X_train / X_test themselves are never modified.
X_train_vect = X_train
X_test_vect = X_test

keyword_vect = CountVectorizer(min_df=3).fit(X_train_vect['keyword'])
keyword_df = _count_features(keyword_vect, X_train_vect['keyword'], 'kw_')
X_train_vect = pd.concat([keyword_df, X_train_vect], axis=1)
keyword_df = _count_features(keyword_vect, X_test_vect['keyword'], 'kw_')
X_test_vect = pd.concat([keyword_df, X_test_vect], axis=1)

location_vect = CountVectorizer(min_df=3, ngram_range=(1, 2)).fit(X_train_vect['location'])
location_df = _count_features(location_vect, X_train_vect['location'], 'loc_')
X_train_vect = pd.concat([location_df, X_train_vect], axis=1)
location_df = _count_features(location_vect, X_test_vect['location'], 'loc_')
X_test_vect = pd.concat([location_df, X_test_vect], axis=1)

text_vect = CountVectorizer(min_df=3, ngram_range=(1, 2)).fit(X_train_vect['text_stemmed'])
text_df = _count_features(text_vect, X_train_vect['text_stemmed'], 'txt_')
X_train_vect = pd.concat([text_df, X_train_vect], axis=1)
text_df = _count_features(text_vect, X_test_vect['text_stemmed'], 'txt_')
X_test_vect = pd.concat([text_df, X_test_vect], axis=1)

## Polynomial Features

In [None]:
# Degree-2 polynomial expansion of the two tweet-length features (train).
lengths_df = X_train_vect[['length_raw', 'length_cleaned']]
# Fit on the bare values so the generated names stay positional
# ('1', 'x0', 'x1', 'x0^2', ...) whichever accessor is used below.
poly = PolynomialFeatures(2).fit(lengths_df.to_numpy())
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that do not have it yet.
poly_names = (poly.get_feature_names_out()
              if hasattr(poly, 'get_feature_names_out')
              else poly.get_feature_names())
X_train_poly = pd.DataFrame(
    poly.transform(lengths_df.to_numpy()),
    columns=list(poly_names),
    index=lengths_df.index,  # keep rows aligned for the concat below
)
X_train_combo = pd.concat([X_train_vect, X_train_poly], axis=1)
# The raw string columns cannot be fed to the estimators.
X_train_combo.drop(['text_stemmed', 'keyword', 'location'], axis=1, inplace=True)

In [None]:
# Degree-2 polynomial expansion of the two tweet-length features (test).
lengths_df = X_test_vect[['length_raw', 'length_cleaned']]
# Fit on the bare values so the generated names stay positional
# ('1', 'x0', 'x1', 'x0^2', ...) and match the training columns.
# NOTE(review): fitting here should only record the number of input
# columns, not data statistics - confirm against the scikit-learn docs.
poly = PolynomialFeatures(2).fit(lengths_df.to_numpy())
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that do not have it yet.
poly_names = (poly.get_feature_names_out()
              if hasattr(poly, 'get_feature_names_out')
              else poly.get_feature_names())
X_test_poly = pd.DataFrame(
    poly.transform(lengths_df.to_numpy()),
    columns=list(poly_names),
    index=lengths_df.index,  # keep rows aligned for the concat below
)
X_test_combo = pd.concat([X_test_vect, X_test_poly], axis=1)
# The raw string columns cannot be fed to the estimators.
X_test_combo.drop(['text_stemmed', 'keyword', 'location'], axis=1, inplace=True)

# Logistic Regression

In [None]:
# Min-max scaling followed by a liblinear logistic regression; the step
# names ('scaler', 'clf') are what the parameter grid keys refer to.
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('clf', LogisticRegression(solver='liblinear')),
])

## Tune parameters

In [None]:
# Search space for the 'clf' step: L2 penalty, liblinear solver and ten
# log-spaced C values between 10**0.1 and 10**0.2.
param_grid = dict(
    clf__penalty=['l2'],
    clf__C=np.logspace(0.1, 0.2, num=10),
    clf__solver=['liblinear'],
)

In [None]:
# 5-fold grid search over the logistic-regression pipeline, also recording
# the training scores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train_combo, y_train)

print("Best cross-validation score:", grid_search.best_score_)
print("Best parameters:\n", grid_search.best_params_)

## Best model

In [None]:
# Hold out part of the training data (fixed seed) for a quick evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_train_combo,
    y_train,
    random_state=42,
)

In [None]:
# Logistic regression with a hard-coded C.
# NOTE(review): unlike the tuned pipeline above, no scaler is applied here -
# confirm this is intentional.
clf = LogisticRegression(
    penalty='l2',
    C=1.2589,
    solver='liblinear',
)

In [None]:
# Train on the non-held-out part of the split.
clf.fit(X=Xtrain, y=ytrain)

## Evaluation

In [None]:
# Probabilities from the second predict_proba column for the held-out rows.
ypred = clf.predict_proba(Xtest)[:, 1]

# Label a row 1 when its probability reaches the 0.56 cut-off, else 0.
scores = [1 if prob >= 0.56 else 0 for prob in ypred]

# Share of held-out rows whose thresholded label equals the true label.
n_correct = np.sum(scores == ytest)
score = n_correct / len(scores)
print(score)

## Output

In [None]:
# Refit on the complete training set before predicting the test set.
clf.fit(X=X_train_combo, y=y_train)

In [None]:
# Probabilities from the second predict_proba column for the test rows,
# turned into 0/1 labels with the same 0.56 cut-off used in the evaluation.
ypred = clf.predict_proba(X_test_combo)[:, 1]
scores = [1 if prob >= 0.56 else 0 for prob in ypred]

In [None]:
# Pair every test id with its predicted label and write the result without
# an index column.
output = pd.DataFrame({'id': test_ids.to_numpy(), 'target': scores})
output.to_csv('lr_results.csv', index=False, header=True)

# Linear SVC

In [None]:
# Single-step pipeline around a linear SVC with a raised iteration cap.
pipe = Pipeline(steps=[('clf', LinearSVC(max_iter=20000))])

## Tune parameters

In [None]:
# Ten evenly spaced C values between 0.0001 and 10 for the 'clf' step.
param_grid = [dict(clf__C=np.linspace(0.0001, 10, num=10))]

In [None]:
# Show the candidate C values used in the grid above.
np.linspace(0.0001, 10, num=10)

In [None]:
#grid_search = GridSearchCV(pipe, param_grid, cv=3, return_train_score=True)
#grid_search.fit(X_train_combo, y_train)

#print("Best cross-validation score:", grid_search.best_score_)
#print("Best parameters:\n", grid_search.best_params_)

# Naive Bayes

In [None]:
# Single-step pipeline; the 'clf' step can be swapped by the parameter grid.
pipe = Pipeline(steps=[('clf', BernoulliNB())])

## Tune parameters

In [None]:
# Try both naive Bayes variants in the 'clf' step, each with ten evenly
# spaced smoothing values between 1.1 and 3.33.
param_grid = [
    dict(
        clf=[BernoulliNB(), MultinomialNB()],
        clf__alpha=np.linspace(1.1, 3.33, num=10),
    ),
]

In [None]:
#grid_search = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True)
#grid_search.fit(X_train_combo, y_train)

#print("Best cross-validation score:", grid_search.best_score_)
#print("Best parameters:\n", grid_search.best_params_)

## Best model

In [None]:
# Hold out part of the training data (fixed seed) for a quick evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_train_combo,
    y_train,
    random_state=42,
)

In [None]:
# Bernoulli naive Bayes with a hard-coded smoothing value.
# NOTE(review): alpha=1.84 presumably comes from the (now commented-out)
# grid search above - confirm before relying on it.
clf = BernoulliNB(alpha=1.84)

In [None]:
# Train on the non-held-out part of the split.
clf.fit(X=Xtrain, y=ytrain)

## Evaluation

In [None]:
# Probabilities from the second predict_proba column for the held-out rows.
ypred = clf.predict_proba(Xtest)[:, 1]

# Label a row 1 when its probability reaches the 0.51 cut-off, else 0.
scores = [1 if prob >= 0.51 else 0 for prob in ypred]

# Share of held-out rows whose thresholded label equals the true label.
n_correct = np.sum(scores == ytest)
score = n_correct / len(scores)
print(score)

## Output

In [None]:
# Refit on the complete training set before predicting the test set.
clf.fit(X=X_train_combo, y=y_train)

In [None]:
# Probabilities from the second predict_proba column for the test rows.
ypred = clf.predict_proba(X_test_combo)[:, 1]

# Use the same 0.51 cut-off the naive Bayes model was evaluated with on the
# held-out split; 0.56 is the cut-off of the logistic-regression section and
# was never evaluated for this model.
scores = [1 if prob >= 0.51 else 0 for prob in ypred]

In [None]:
# Pair every test id with its predicted label and write the result without
# an index column.
output = pd.DataFrame({'id': test_ids.to_numpy(), 'target': scores})
output.to_csv('nb_results.csv', index=False, header=True)