## **SKLEARN LOGISTIC REGRESSION for NLP**

    - Basic EDA
    - Data Cleaning
    - SKLEARN Model

In [None]:
# Library

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import  f1_score, classification_report

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Load the competition's training and test sets from the Kaggle input directory.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Remove the columns that are not used in this notebook.
unused_columns = ['keyword', 'location']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Print the number of NaN / missing values per column for each dataset.
for frame in (train, test):
    print(frame.isna().sum())

In [None]:
# Visualize tweet length in characters, one box per target class.
plt.figure(figsize=(12,6))
train_sent = train['text'].str.len()  # number of characters in each tweet
sns.boxplot(x="target",y=train_sent,data=train,palette="Set1")
plt.xlabel("Tweet Fallacy")
plt.ylabel("Tweet Length by character")
plt.show()

**Disaster tweets tend to be longer (in characters)**

In [None]:
# Visualize tweet length in whitespace-separated words, one box per target class.
train_sent = train['text'].str.split().map(len)
plt.figure(figsize=(12, 6))
sns.boxplot(x="target", y=train_sent, data=train, palette="Set3")
plt.xlabel("Tweet Fallacy")
plt.ylabel("Tweet length by word")
plt.show()

**By word count, disaster and non-disaster tweets have similar lengths**

## **Data Cleaning**

In [None]:
# Removing URLs
def url_clean(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` token removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Removing HTML tags
def html_clean(text):
    """Return ``text`` with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', '', text)

In [None]:
# Compiled once: the function is applied to every tweet.
# The previous catch-all range U+24C2..U+1F251 also covered CJK, Hangul and
# many other ordinary scripts, so it deleted real text. It is replaced here by
# the specific symbol blocks it was meant to reach.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u24C2"                 # circled latin capital letter M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the symbol ranges listed in ``_EMOJI_PATTERN`` are stripped; letters
    of any script (including CJK and Hangul) are left untouched.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` without the matched symbols. Surrounding whitespace is kept.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# lower case
def to_lower(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

In [None]:
# Contraction
# Maps an English contraction to its expanded form. Keys are matched
# case-insensitively by expand_contractions(); ambiguous forms such as "he'd"
# are given a single fixed expansion.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I had",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that had",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

def expand_contractions(text, contractions_dict):
    """Replace every contraction in ``text`` with its expansion.

    Matching is case-insensitive. The replacement is the dictionary value as
    written (the casing of the matched text is not carried over). Any
    apostrophe still present after expansion is removed.

    Parameters
    ----------
    text : str
        Input text.
    contractions_dict : dict[str, str]
        Mapping from contraction to expanded form.

    Returns
    -------
    str
        The expanded text with all remaining ``'`` characters stripped.
    """
    if contractions_dict:
        # Lower-cased view of the mapping: keys such as "I'm" must also be
        # found when the input has already been lower-cased ("i'm").
        # Without it the callback would return None and re.sub would drop
        # the contraction from the text entirely.
        lowered = {key.lower(): value for key, value in contractions_dict.items()}

        # Longest keys first, so "can't've" is tried before its prefix "can't".
        alternatives = sorted(lowered, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL,
        )

        def expand_match(contraction):
            matched = contraction.group(0)
            exact = contractions_dict.get(matched)
            if exact:
                return exact
            # Fall back to the matched text itself rather than None when the
            # case-folded lookup misses (possible with unusual Unicode casing).
            return lowered.get(matched.lower(), matched)

        text = contractions_pattern.sub(expand_match, text)

    return re.sub("'", "", text)

def main_contraction(text):
    """Expand the contractions in ``text`` using the module-level ``contractions_dict``."""
    return expand_contractions(text, contractions_dict)

In [None]:
# remove Punctuation
from string import punctuation
def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept = [char for char in text if char not in punctuation]
    return ''.join(kept)

In [None]:
# remove whitespace
def to_strip(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

In [None]:
# remove stopwords
# Filled on first use so the corpus is read once instead of once per tweet.
_ENGLISH_STOPWORDS = None


def remove_stopwords(sentence):
    """Return ``sentence`` with NLTK's English stop words removed.

    The sentence is tokenized with ``nltk.word_tokenize``; the surviving
    tokens are joined back with single spaces.

    Parameters
    ----------
    sentence : str
        Text to filter. The comparison is case-sensitive, so the text is
        expected to be lower-cased already.

    Returns
    -------
    str
        The remaining tokens separated by one space.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # frozenset gives constant-time membership tests instead of a list scan.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return ' '.join(
        token for token in nltk.word_tokenize(sentence)
        if token not in _ENGLISH_STOPWORDS
    )

In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is split into sentences first, then each sentence into word tokens.
    """
    lemmas = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return " ".join(lemmas)

**Apply to Test & Train Data**

In [None]:
# Run every cleaning step over the training tweets, in this exact order.
cleaning_steps = (
    url_clean,
    html_clean,
    remove_emoji,
    to_lower,
    main_contraction,
    remove_punct,
    to_strip,
    remove_stopwords,
    lemmatize,
)
for clean in cleaning_steps:
    train['text'] = train['text'].apply(clean)
train['text'].head(5)

In [None]:
# Run the same cleaning steps, in the same order, over the test tweets.
cleaning_steps = (
    url_clean,
    html_clean,
    remove_emoji,
    to_lower,
    main_contraction,
    remove_punct,
    to_strip,
    remove_stopwords,
    lemmatize,
)
for clean in cleaning_steps:
    test['text'] = test['text'].apply(clean)


## **Text Exploration**

In [None]:
# Word Frequency

def mydict(check):
    """Build a word-frequency table from a Series of strings.

    Every maximal run of ASCII letters counts as one word; matching is
    case-sensitive, so ``"Fire"`` and ``"fire"`` are counted separately.

    Parameters
    ----------
    check : pandas.Series
        Series of text values.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word, indexed by the word (index name ``word``),
        with a single ``freq`` column sorted in descending order.
    """
    # The character class previously read [a-zA_Z], which matched only
    # lowercase letters, "A", "_" and "Z".
    words = check.str.extractall(r'([a-zA-Z]+)')[0].reset_index(drop=True)
    counts = words.value_counts()

    freq_table = pd.DataFrame(
        {'freq': counts.values},
        index=pd.Index(counts.index, name='word'),
    )
    freq_table.sort_values('freq', ascending=False, inplace=True)

    return freq_table

In [None]:
# Word-frequency table for the training tweets.
mydict_unclean = mydict(train['text'])

# Horizontal bar chart of the first 20 rows of the table.
sns.set_context(context='notebook', font_scale=1.6)
mydict_unclean.head(20).plot(kind='barh', figsize=(10, 10));

In [None]:
# Word Cloud
from wordcloud import WordCloud
def plot_cloud(wordcloud):
    """Show ``wordcloud`` as an image on a new 20x10 figure with the axes hidden."""
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud) 
    plt.axis('off')

In [None]:
# str(Series) is the console repr (truncated rows plus a name/dtype footer),
# not the corpus, so join the tweets themselves before generating the cloud.
word_cloud = WordCloud().generate(' '.join(train['text']))
plot_cloud(word_cloud)

In [None]:
# Word cloud of the disaster tweets only (target == 1).
# Join the tweet texts: str(Series) would give the truncated console repr
# rather than the full corpus.
disaster_text = ' '.join(train[train['target'] == 1]['text'])
word_cloud_dissaster = WordCloud().generate(disaster_text)
plot_cloud(word_cloud_dissaster)

In [None]:
# Character length of each cleaned tweet.
train['length'] = train['text'].apply(len) 
plt.figure(figsize = (10, 10))

# Split the rows by class: target 1 is a disaster tweet, 0 is not.
df_dissater = train[train['target'] == 1]
df_non = train[train['target'] == 0]

# Overlay the two length distributions on the same axes.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/kdeplot — confirm against the installed version.
sns.distplot(df_dissater['length'])
sns.distplot(df_non['length'])

plt.legend(['Dissaster', 'Not']);

## **Data Splitting & Modeling on the Train Set**

In [None]:
# Features are the cleaned tweet texts; the label is the binary target column.
X = train['text']
y = train['target']

In [None]:
# Stratified hold-out split: 15% of the rows go to the evaluation set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0,test_size=0.15)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary on the training split only, then encode both splits as token counts.
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

**Compare multiple models to find which one performs best**

In [None]:
# Candidate classifiers, all with default hyperparameters.
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier(random_state=0)
logreg = LogisticRegression(random_state=0)
rf = RandomForestClassifier(random_state=0)
xgb = XGBClassifier()
models = [knn, tree, logreg, rf, xgb]

# 5-fold stratified cross-validation; the fixed seed gives every model the same folds.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold F1 scores for each model, followed by their mean and standard deviation.
score = [
    cross_val_score(estimator, X_train, y_train, cv=skfold, scoring='f1', error_score='raise')
    for estimator in models
]
score_mean = [fold_scores.mean() for fold_scores in score]
score_std = [fold_scores.std() for fold_scores in score]

In [None]:
# Summary table: mean and standard deviation of the cross-validated F1 per model.
# The labels are listed in the same order as the models were evaluated.
pd.DataFrame({
    'model':['knn', 'tree', 'logreg', 'rf','xgb'],
    'mean':score_mean,
    'std':score_std
})

**Logistic Regression, Random Forest, and XGB are the top picks, so we check their scores on the test split**

In [None]:
# The three short-listed models, with default hyperparameters.
models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0),
    'XGB': XGBClassifier(),
}

# Fit each model on the training split and record its F1 on the held-out split.
score = []
for estimator in models.values():
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    score.append([f1_score(y_test, y_pred)])

score_before_tuning = pd.DataFrame(score, columns=['f1 score'], index=models.keys())
score_before_tuning


In [None]:
# Hyperparameter Tuning
# Logistic Regression
# A list of grids so that every candidate is a solver/penalty pair that
# LogisticRegression accepts: liblinear handles l1 and l2, while newton-cg and
# lbfgs handle l2 only. 'elasticnet' (needs the saga solver plus l1_ratio) and
# the unpenalized option are left out; a very large C already approximates
# the latter.
_logreg_C = [1000, 100, 10, 1, 0.1, 0.01]
_logreg_max_iter = [100, 200, 300]

hyperparam_space_logreg = [
    {
        'model__solver': ['liblinear'],
        'model__penalty': ['l1', 'l2'],
        'model__C': _logreg_C,
        'model__max_iter': _logreg_max_iter,
    },
    {
        'model__solver': ['newton-cg', 'lbfgs'],
        'model__penalty': ['l2'],
        'model__C': _logreg_C,
        'model__max_iter': _logreg_max_iter,
    },
]



**These are the key hyperparameters, but I chose only the ones I think are important**

**Hyperparameter tuning for the best logistic regression model**


In [None]:
# Stratified 5-fold cross-validation with a fixed seed.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Pipeline: TF-IDF re-weighting of the input features, then logistic regression.
vect = TfidfTransformer()
model = LogisticRegression(random_state=0)
pipe = Pipeline(steps=[('preprocessing', vect), ('model', model)])

# Grid search over the logistic-regression space, scored by F1, using all cores.
grid_logreg = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparam_space_logreg,
    scoring='f1',
    cv=skfold,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
grid_logreg.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the search and the parameters that produced it.
print('Best score logreg:', grid_logreg.best_score_)
print('Best params logreg:', grid_logreg.best_params_)

**Remodeling and Refitting Using Best model**

In [None]:
# Take the best pipeline found by the grid search and fit it on the training split.
logreg_tuning = grid_logreg.best_estimator_
logreg_tuning.fit(X_train, y_train)

# Predict the held-out split with the tuned pipeline.
y_pred_logreg_tuning = logreg_tuning.predict(X_test)

# NOTE: despite its name, this variable holds the F1 score, not accuracy.
accuracy_logreg_tuning = f1_score(y_test, y_pred_logreg_tuning)

In [None]:
# accuracy_logreg_tuning is computed with f1_score, so the column is labelled
# 'f1 score' to match the before-tuning table.
score_list = [accuracy_logreg_tuning]
models = ['Logistic Regression']

score_after_tuning = pd.DataFrame({
    'Model': models,
    'f1 score': score_list
})

In [None]:
# Display the score table of the tuned model.
score_after_tuning

In [None]:
# y_pred still holds the predictions of the last model fitted in the
# before-tuning loop (XGB). Fit an untuned logistic regression here so that
# the two reports compare the same kind of model.
y_pred_logreg_baseline = (
    LogisticRegression(random_state=0).fit(X_train, y_train).predict(X_test)
)

print('Before tuning\n', classification_report(y_test, y_pred_logreg_baseline))
print('After tuning\n', classification_report(y_test, y_pred_logreg_tuning))

**The accuracy increases a little after hyperparameter tuning**

### **Submission**

In [None]:
# For the submission, train on the whole training set (cleaned text and labels).
X_train = train["text"]
y_train = train["target"]

In [None]:
# logreg_tuning is itself a Pipeline (TF-IDF -> logistic regression). Using it
# whole as the final step after another TfidfTransformer would apply TF-IDF
# twice, which is not the configuration that was tuned. Reuse its two steps
# behind a CountVectorizer instead.
tfid = logreg_tuning.named_steps['preprocessing']
vect = CountVectorizer()
pipe = Pipeline([('vectorizer', vect),
    ('preprocessing', tfid),
    ('model', logreg_tuning.named_steps['model'])])


In [None]:
# Fit the submission pipeline on the full training data.
pipe.fit(X_train, y_train)

In [None]:
# Predict the target for the cleaned test tweets.
X_test = test['text']
preds = pipe.predict(X_test)

In [None]:
# Pair each test id with its predicted target, on a fresh 0..n-1 index.
ids = test["id"]
submission_df = pd.DataFrame({"id": ids, "target": preds}).reset_index(drop=True)

In [None]:
# Inspection only: number of test tweets that were predicted.
len(X_test)

In [None]:
# Inspection only: display the cleaned test frame.
test

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)