In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import seaborn as sns

# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and is missing
# from newer releases, where the same style ships as 'seaborn-v0_8'.
# Use whichever name this matplotlib installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import nltk
import string
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training CSV from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

In [None]:
# Preview the first rows to see which columns the training file provides.
train_df.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Per-column data types as inferred by read_csv.
train_df.dtypes

In [None]:
# Count the missing entries in each column and show them as a bar chart.
missing_per_column = train_df.isna().sum()
missing_per_column.plot.bar()
plt.title("Missing values")
plt.show()

In [None]:
# Number of tweets per class, shown as a bar chart to check class balance.
class_sizes = train_df.groupby('target').size()
target_count = class_sizes.reset_index(name='counts')

plt.bar(target_count['target'], target_count['counts'])
plt.xticks([0,1], labels=["Not disaster tweets", "disaster tweets"])
plt.title("Target Distribution")
plt.show()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()

eng_stopwords = set(stopwords.words('english'))

# Maps every ASCII punctuation character to a space. Replacing (rather than
# deleting) keeps words that were separated only by punctuation apart
# ("fire,flood" -> "fire flood") and splits contractions into fragments that
# NLTK's English stopword list contains ("don't" -> "don", "t"), so the
# stopword filter below can still remove them.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_text(val):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, drop URLs, turn punctuation and digits into
    spaces, collapse whitespace, tokenize with NLTK, remove English stopwords,
    and Porter-stem the remaining tokens.

    Parameters
    ----------
    val : str
        Raw tweet text. Non-string values (e.g. the NaN pandas uses for a
        missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing cells reach .apply() as NaN/None, which have no .lower().
    if not isinstance(val, str):
        return ''

    # convert to lower case
    val = val.lower()

    # drop URLs first, while their punctuation is still intact
    val = re.sub(r"http\S+", "", val)

    # replace punctuation with spaces so neighbouring words are not fused
    val = val.translate(_PUNCT_TO_SPACE)

    # remove digits
    val = re.sub(r"\d", " ", val)

    # collapse multiple whitespace characters into a single space
    val = re.sub(r"\s+", " ", val)

    # TODO: Should use NLTK's lemmatization

    tokens = nltk.word_tokenize(val)

    # drop stopwords, then stem what is left
    stems = [ps.stem(t) for t in tokens if t not in eng_stopwords]

    return ' '.join(stems)

In [None]:
# Clean every training tweet and keep the result next to the raw text.
train_df['clean_text'] = train_df['text'].map(preprocess_text)

In [None]:
# Character length of each cleaned tweet (vectorized string accessor).
train_df['clean_text_len'] = train_df['clean_text'].str.len()

In [None]:
# Re-inspect the frame now that clean_text and clean_text_len have been added.
train_df.head()

In [None]:
# Summary statistics; by default describe() reports only the numeric columns.
train_df.describe()

In [None]:
# Compare the cleaned-text length distribution of the two classes side by side.
fig, (ax_not_disaster, ax_disaster) = plt.subplots(1, 2, figsize=(10, 5))

ax_not_disaster.hist(train_df.loc[train_df.target == 0, 'clean_text_len'])
ax_not_disaster.set_title("Not disaster tweets")

ax_disaster.hist(train_df.loc[train_df.target == 1, 'clean_text_len'], color='orange')
ax_disaster.set_title("Disaster tweets")

# Figure-level axis labels shared by both panels (label spelling corrected).
fig.supxlabel("tweet lengths")
fig.supylabel("counts")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# from sklearn.preprocessing import Normalizer

In [None]:
# Hold out 20% of the cleaned tweets for validation; the seed keeps the split fixed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['clean_text'],
    train_df['target'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Unigram TF-IDF features, capped at 8000 terms.
vectorizer = TfidfVectorizer(max_features=8000, ngram_range=(1,1))

# Learn the vocabulary and IDF weights from the training split only, so nothing
# from the validation split leaks into the features.
# NOTE: this rebinds X_train / X_valid from raw text to TF-IDF matrices, so the
# split cell must be re-run before this cell is executed a second time.
X_train = vectorizer.fit_transform(X_train)
X_valid = vectorizer.transform(X_valid)

In [None]:
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier # , GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# random_state is pinned (same seed as the train/validation split) so the
# grid-search results and the final predictions are repeatable across runs.
models = [
    {
        'name' : 'Light GBM Classifier',
        'estimator' : LGBMClassifier(random_state=42),
        'params' : {'n_estimators': (50, 100, 150, 200, 500)}
    },
    {
        'name' : 'Random Forest Classifier',
        'estimator' : RandomForestClassifier(random_state=42),
        'params' : {'n_estimators':(10, 50, 100, 150, 200)}
    },
]

In [None]:
# Tune every candidate with a cross-validated grid search, keep its best
# estimator, and report the hold-out score plus the CV score of each setting.
best_estimators = {}
for model in models:
    search = GridSearchCV(
        model['estimator'],
        param_grid=model['params'],
        verbose=1,
        n_jobs=3,
    )
    search.fit(X_train, y_train)

    # Store the winning configuration under the model's display name.
    best_estimators[model['name']] = search.best_estimator_
    score = best_estimators[model['name']].score(X_valid, y_valid)

    print("-------------------------------------")
    print("Model:", model["name"], ", Score:", score)

    # One line per grid point: parameters, mean CV score, CV standard deviation.
    cv_results = search.cv_results_
    per_setting = zip(
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
    )
    for param, mean_score, std_score in per_setting:
        print(f"{param}, {mean_score:.5f}, (-/+ {std_score:.5f})")
        

In [None]:
# Pick the tuned estimator with the highest validation score instead of
# hard-coding one model's name, so the choice follows the current run's results.
# Ties go to the estimator that was added to best_estimators first.
best_name = max(
    best_estimators,
    key=lambda name: best_estimators[name].score(X_valid, y_valid),
)
best_model = best_estimators[best_name]
print("Selected model:", best_name)
best_model.get_params()

In [None]:
# Run the test tweets through the same cleaning function and the vectorizer
# that was fitted on the training split.
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_df['clean_text'] = test_df['text'].map(preprocess_text)
X_test = vectorizer.transform(test_df['clean_text'])

In [None]:
# Predictions of the selected model for the test set. Despite the name,
# `scores` holds the output of predict(), not probabilities.
scores = best_model.predict(X_test)

In [None]:
# Spot-check the first 20 predictions.
scores[:20]

In [None]:
# Reuse the sample submission frame and overwrite its target column with the
# model's predictions.
submit_df = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Bracket assignment always writes the column. Attribute assignment
# (submit_df.target = ...) only updates a column that already exists; otherwise
# it just sets a Python attribute that to_csv() never sees.
submit_df['target'] = scores

submit_df.to_csv("submission.csv", index=False)