In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import re
import warnings 

%matplotlib inline
warnings.filterwarnings('ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's test and train splits (paths are relative to the notebook's working directory).
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')

# Stack both splits into one frame that is used for exploratory analysis only.
# NOTE(review): rows from test.csv presumably have no `target` value, which is why the
# EDA below reports a 'null' target group - confirm against the data.
concat_df = pd.concat([test_df, train_df], ignore_index=True)

# Shuffle with a fixed seed so that Restart & Run All reproduces the same row order.
concat_df = concat_df.sample(frac=1, ignore_index=True, random_state=42)

# **Exploratory Data Analysis**

In [None]:
# Preview only the first rows instead of rendering the whole combined frame.
concat_df.head()

In [None]:
# Peek at five randomly chosen rows (no seed is set, so the selection changes on every run).
concat_df.sample(5)

In [None]:
# Print a concise summary of the combined frame (columns, non-null counts, dtypes).
concat_df.info()

In [None]:
# Descriptive statistics of the combined frame.
concat_df.describe()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
concat_df.duplicated().sum()

## Null values count

In [None]:
# Per-column missing-value summary: absolute count and percentage of all rows,
# ordered from the most to the least incomplete column.
missing_counts = concat_df.isna().sum()
null = pd.DataFrame({
    'null': missing_counts,
    'pct_null': round(missing_counts / len(concat_df), 5) * 100,
}).sort_values('pct_null', ascending=False)
null

In [None]:
# Side-by-side bar charts of the missing values per column:
# absolute counts on the left, percentage of all rows on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20,6))
fig.suptitle('Null Counts', fontsize=20)

# Bracket access avoids relying on attribute lookup for a column literally named 'null'.
ax1.bar(null.index, null['null'])
ax1.set_ylabel('Number of null values')

ax2.bar(null.index, null['pct_null'])
ax2.set_ylabel('Null values (%)')

# Without the y-axis labels the two panels cannot be told apart when skimming.
plt.tight_layout()
plt.show()

## Target distribution

In [None]:
# Count each target class plus the rows that have no target at all.
# value_counts() orders by frequency, so sort by class label to keep every count
# aligned with its label regardless of which class happens to be more common.
class_counts = concat_df.target.value_counts().sort_index()

targets = [str(label) for label in class_counts.index] + ['null']
target_count = [int(count) for count in class_counts] + [int(concat_df.target.isna().sum())]

target_count

In [None]:
# Target distribution drawn twice: a bar chart (left) and a pie chart (right)
# in which the third slice is pulled out of the pie.
fig, (bar_ax, pie_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
fig.suptitle('Target Counts', fontsize=20)

bar_ax.bar(targets, target_count)
pie_ax.pie(
    target_count,
    labels=targets,
    explode=[0.0, 0.0, 0.1],
    startangle=90,
    shadow=True,
    autopct='%1.1f%%',
)

fig.tight_layout()
plt.show()

# **Feature Engineering**

In [None]:
class FE:
    """Feature-engineering entry point wrapping a DataFrame of tweets.

    The wrapped frame is modified in place. It must provide the ``text``,
    ``location`` and ``keyword`` columns that :class:`Column` reads.
    """

    def __init__(self, df):
        self.df = df

    def add_column(self):
        """Add every engineered column to ``self.df`` in place; returns ``None``."""
        column = Column(self.df)
        column.add_all()


class Column(FE):
    """Adds the individual engineered columns to the wrapped DataFrame.

    Every ``add_*`` method assigns whole columns positionally, so the frame may
    carry any index (not only the default 0..n-1 range). Counts and flags are
    stored as integers.
    """

    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    _HASHTAG_PATTERN = r"#(\w+)"
    _MENTION_PATTERN = r"@(\w+)"

    def __init__(self, df):
        super().__init__(df)

    def _add_matches(self, pattern, joined_column, count_column):
        """Store the ', '-joined regex matches of each text and how many there were.

        Rows without any match get ``None`` in ``joined_column`` and ``0`` in
        ``count_column``.
        """
        found = [re.findall(pattern, text) for text in self.df['text']]
        self.df[joined_column] = [', '.join(items) if items else None for items in found]
        self.df[count_column] = [len(items) for items in found]

    @staticmethod
    def _clean(text):
        """Return ``text`` lower-cased with URLs, mentions, punctuation and digit tokens removed."""
        cleaned = re.sub(r'http\S+', '', text)
        # Mentions must be stripped while the '@' is still present; once the
        # punctuation pass below has run, the pattern can no longer match.
        cleaned = re.sub(r'@(\w+)', ' ', cleaned)
        cleaned = re.sub(r'[^A-Za-z0-9]+', ' ', cleaned)
        # Drop every token that contains at least one digit.
        cleaned = re.sub(r'\w*\d\w*', '', cleaned)
        cleaned = cleaned.strip()
        cleaned = re.sub(r"\s\s+", " ", cleaned)
        return cleaned.lower()

    def add_len(self):
        """Add ``text_len``: number of characters of each text."""
        self.df['text_len'] = self.df['text'].apply(len)

    def add_tags(self):
        """Add ``hashtags`` (joined tag names) and ``hashtags_count``."""
        self._add_matches(self._HASHTAG_PATTERN, 'hashtags', 'hashtags_count')

    def add_mention(self):
        """Add ``mention`` (joined user names) and ``mention_count``."""
        self._add_matches(self._MENTION_PATTERN, 'mention', 'mention_count')

    def add_clean_text(self):
        """Add ``clean_text``: the normalised version of each text."""
        self.df['clean_text'] = [self._clean(text) for text in self.df['text']]

    def add_has_location(self):
        """Add ``has_location``: 1 when the location is a string, otherwise 0."""
        self.df['has_location'] = [int(isinstance(value, str)) for value in self.df['location']]

    def add_has_key(self):
        """Add ``has_key``: 1 when the keyword is a string, otherwise 0."""
        self.df['has_key'] = [int(isinstance(value, str)) for value in self.df['keyword']]

    def add_all(self):
        """Run every ``add_*`` step and return the (in-place modified) frame."""
        self.add_len()
        self.add_tags()
        self.add_mention()
        self.add_clean_text()
        self.add_has_location()
        self.add_has_key()

        return self.df

In [None]:
# Engineer the extra columns on both splits; FE modifies each frame in place.
for frame in (train_df, test_df):
    fe = FE(frame)
    fe.add_column()

# **Data Visualization**

In [None]:
# Tweet length in the training data: overall spread as a box plot (left)
# and a histogram stacked by target class (right).
fig, (box_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
fig.suptitle('Tweet Text Length', fontsize=20)

sns.boxplot(data=train_df, x='text_len', ax=box_ax)
sns.histplot(data=train_df, x='text_len', hue='target', multiple='stack', ax=hist_ax)

fig.tight_layout()
plt.show()

In [None]:
# Reorder the training columns so that each raw field sits next to the features derived from it.
ordered_columns = [
    'id', 'keyword', 'location', 'has_key', 'has_location',
    'hashtags', 'hashtags_count', 'mention', 'mention_count',
    'text_len', 'target', 'text', 'clean_text',
]
train_df = train_df[ordered_columns]

In [None]:
# Number of tweets with / without a keyword, split by target class.
key_ax = sns.countplot(data=train_df, x='has_key', hue='target', edgecolor='black')
key_ax.set_title('Has keywords count', fontsize=16)
key_ax.figure.tight_layout()

In [None]:
# Number of tweets with / without a location, split by target class.
location_ax = sns.countplot(data=train_df, x='has_location', hue='target', edgecolor='black')
location_ax.set_title('Has location count', fontsize=16)
location_ax.figure.tight_layout()

In [None]:
# Hashtags per tweet: histogram stacked by target class (left) and box plot (right).
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
fig.suptitle('Hashtags count', fontsize=20)

sns.histplot(data=train_df, x='hashtags_count', hue='target', multiple='stack', ax=hist_ax)
sns.boxplot(data=train_df, x='hashtags_count', ax=box_ax)

fig.tight_layout()

In [None]:
# Mentions per tweet: frequency of each count (left) and box plot (right).
fig, (count_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
fig.suptitle('Mention count', fontsize=20)

sns.countplot(data=train_df, x='mention_count', edgecolor='black', ax=count_ax)
sns.boxplot(data=train_df, x='mention_count', ax=box_ax)

fig.tight_layout()

In [None]:
# The 50 most frequent keywords and their counts, most common first.
# (The `top5_*` names are kept because the plotting cell refers to them; they hold 50 entries.)
keyword_counts = train_df.keyword.value_counts().head(50)
top5_keys = keyword_counts.index.tolist()
top5_count = keyword_counts.tolist()

In [None]:
# Horizontal bar chart of the most frequent keywords.
fig = plt.figure(figsize=(20, 16))

keyword_ax = sns.barplot(x=top5_count, y=top5_keys, palette='rocket')
keyword_ax.set_title('Top 50 Keywords', fontsize=20)

fig.tight_layout()

In [None]:
stopwords = set(STOPWORDS)

# Build one text per class. The tweets are joined with a space: concatenating them
# directly would glue the last word of one tweet to the first word of the next and
# feed the word cloud tokens that never occurred in the data.
target1_words = ' '.join(train_df[train_df.target == 1.0].clean_text)
target0_words = ' '.join(train_df[train_df.target == 0.0].clean_text)

fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(16,7))

# Both clouds share the same rendering settings so that they are directly comparable.
cloud_settings = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)

wordcloud1 = WordCloud(**cloud_settings).generate(target1_words)
wordcloud0 = WordCloud(**cloud_settings).generate(target0_words)

ax1.imshow(wordcloud1)
ax1.set_title('Target 1 Word Cloud', fontsize=18)

ax2.imshow(wordcloud0)
ax2.set_title('Target 0 Word Cloud', fontsize=18)

plt.show()

# **Data Preprocessing**

### **Balance the number of target 0 and 1**

In [None]:
# Class sizes of the training data before balancing.
print(train_df['target'].value_counts())

In [None]:
class Data:
    """Balances a labelled DataFrame by truncating the larger target class.

    The wrapped frame must have a ``target`` column holding 1 / 0 labels.
    """

    def __init__(self, data):
        self.data = data

    def fix(self, random_state=None):
        """Return a shuffled copy with equally many target-1 and target-0 rows.

        The first ``minimum`` rows of each class are kept, where ``minimum`` is the
        size of the smaller class in ``self.data`` (not in any global frame).

        Parameters
        ----------
        random_state : int or None, optional
            Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle.
            The default ``None`` keeps the shuffle unseeded.

        Returns
        -------
        pandas.DataFrame
            The balanced rows in shuffled order with a fresh 0..n-1 index; the
            previous index is kept as an ``index`` column. ``self.data`` is not
            modified.
        """
        positives = self.data[self.data.target == 1.0]
        negatives = self.data[self.data.target == 0.0]
        # Measuring both classes directly also covers a frame in which one class is absent.
        minimum = min(len(positives), len(negatives))

        fixed_data = pd.concat([positives[:minimum], negatives[:minimum]])

        fixed_data = fixed_data.sample(frac=1, random_state=random_state).reset_index()
        return fixed_data

In [None]:
# Replace the training frame with its class-balanced version, then show the class sizes.
train_df = Data(train_df).fix()

train_df['target'].value_counts()

### **Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned tweet texts; labels are the `target` column.
X = train_df['clean_text']
y = train_df['target']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### **Vectorize Text**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features built with scikit-learn's default CountVectorizer settings.
vectorizer = CountVectorizer()

# The vocabulary is learned from the training texts only and then reused for the
# held-out texts, so nothing from the evaluation split influences the features.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# **Creating Models**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
class Model:
    """Fits an estimator on the notebook's training vectors and prints its scores.

    Relies on the notebook-level ``train_x_vectors``, ``train_y``,
    ``test_x_vectors`` and ``test_y`` being defined before use.
    """

    def __init__(self, model):
        self.model = model

    def predict(self):
        """Fit the estimator, predict the held-out vectors and print the metrics."""
        self.model.fit(train_x_vectors, train_y)
        model_pred = self.model.predict(test_x_vectors)
        self.get_score(model_pred)

    def get_score(self, model_pred):
        """Print accuracy, 5-fold cross-validation mean, classification report and F1.

        ``model_pred`` holds the predictions for ``test_x_vectors``.
        """
        # scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
        # classification report shows precision and recall exchanged and takes the
        # support from the predictions instead of the true labels.
        model_acc = accuracy_score(test_y, model_pred)
        model_report = classification_report(test_y, model_pred)
        model_f1 = f1_score(test_y, model_pred)
        # Cross-validation runs on the training split only.
        model_cv = cross_val_score(self.model, train_x_vectors, train_y, cv=5)

        print(f'Accuracy Score: {model_acc}\nMean Cross Validation: {np.mean(model_cv)}\n\n{model_report}\nf1_score: {model_f1}')

### **Linear SVC**

In [None]:
from sklearn.svm import LinearSVC

# Linear support vector classifier with default settings.
linear_svc = LinearSVC()
model = Model(linear_svc)
model.predict()

### **Support Vector Machine**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings.
svc = SVC()
model = Model(svc)
model.predict()

### **Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that the printed scores are repeatable across runs
# (same seed value as the train/test split).
model = Model(RandomForestClassifier(random_state=1))
model.predict()

### **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that the printed scores are repeatable across runs
# (same seed value as the train/test split).
model = Model(DecisionTreeClassifier(random_state=1))
model.predict()

### **K-NN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()
model = Model(knn)
model.predict()

### **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings.
log_reg = LogisticRegression()
model = Model(log_reg)
model.predict()

### **Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier with default settings.
naive_bayes = MultinomialNB()
model = Model(naive_bayes)
model.predict()

# **Hyperparameter Tuning**
I'll be using logistic regression since it has the highest cross-validation score.

In [None]:
# 5-fold grid search over solver and regularisation strength C (L2 penalty only).
model = LogisticRegression()
params = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear'],
    penalty=['l2'],
    C=[100, 10, 1.0, 0.1, 0.01],
)

logistic = GridSearchCV(estimator=model, param_grid=params, cv=5).fit(train_x_vectors, train_y)

# Predictions of the fitted search object for the held-out vectors.
logistic_pred = logistic.predict(test_x_vectors)

# One row per parameter combination, taken from the search's cv_results_.
score = pd.DataFrame(logistic.cv_results_)

In [None]:
# scikit-learn metrics take (y_true, y_pred); the printed accuracy itself is the same either way.
print(f'Accuracy Score: {accuracy_score(test_y, logistic_pred)}')

In [None]:
# Grid-search results ordered from the best to the worst mean cross-validation score.
shown_columns = ['param_solver', 'param_C', 'param_penalty', 'mean_test_score']
score[shown_columns].sort_values(by='mean_test_score', ascending=False)

# **Submit Result**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the models above were compared on CountVectorizer features, while the
# submission is built on TF-IDF features - confirm that this switch is intended.
vec = TfidfVectorizer()

# Learn the vocabulary and weights on the training texts, then apply them to the competition test texts.
train_vec = vec.fit_transform(train_df['clean_text'])
test_vec = vec.transform(test_df['clean_text'])

In [None]:
# Train a default logistic regression on all training rows and predict the competition test set.
final_model = LogisticRegression().fit(train_vec, train_df['target'])

model_pred = final_model.predict(test_vec)

In [None]:
# Fill the sample submission with the predictions and write it to the working directory.
# NOTE(review): assumes the sample submission lists its rows in the same order as test.csv - confirm.
submission_template = '../input/nlp-getting-started/sample_submission.csv'
output = pd.read_csv(submission_template).assign(target=model_pred)

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")