In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import feature_extraction, linear_model, model_selection, preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walking it recursively), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Tweet Preprocessing

Since we are dealing with tweets in this competition, we need to do tweet-specific text cleaning in addition to normal text pre-processing. A tweet may contain
* URLs
* Mentions
* Hashtags
* Emojis
* Specific words etc.

To clean the tweets, we can use the Python library `tweet-preprocessor` instead of writing the cleaning logic ourselves.

# Reading Datasets

In [None]:
# Read the training and test CSV files (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Preview the first rows of the freshly loaded training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
!pip install tweet-preprocessor

In [None]:
# Non-null value count per column, before any rows are dropped.
train_df.count()

# Dropping duplicates and NaN from the DataFrame.

In [None]:
# Only `text` and `target` are used by the model further down, so rows are
# dropped based on those two columns alone. A bare dropna() would also throw
# away every tweet that merely lacks a value in some other, unused column.
# NOTE(review): duplicates are likewise judged on (text, target) only, so rows
# that differ solely in unused columns count as duplicates - confirm this is
# the intended meaning of "duplicate" for this dataset.
MODEL_COLUMNS = ['text', 'target']
train_df = (
    train_df
    .dropna(subset=MODEL_COLUMNS)
    .drop_duplicates(subset=MODEL_COLUMNS)
)

In [None]:
# Non-null value count per column, after dropping NaN rows and duplicates.
train_df.count()

In [None]:
# Preview the training data after dropping NaN rows and duplicates.
train_df.head()

# Applying Tweet Processing

Apply tweet preprocessing first: define a processing function and use pandas to apply it to each value of `text`.

In [None]:
import preprocessor as p

def preprocess_tweet(row):
    """Return the row's 'text' value after passing it through ``p.clean``.

    Written for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by 'text'.

    Returns:
        Whatever ``p.clean`` returns for ``row['text']``.
    """
    return p.clean(row['text'])

In [None]:
# Run every row through preprocess_tweet and store the result back in 'text'.
cleaned_tweets = train_df.apply(preprocess_tweet, axis=1)
train_df['text'] = cleaned_tweets

### Tweet has been cleaned to normal text.

In [None]:
# Preview the training data after tweet-specific cleaning.
train_df.head()

# Normal Preprocessing

### Now we can apply normal text preprocessing, such as
* Lowercasing
* Punctuation Removal
* Replace Extra white Spaces
* Stopwords removal

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

def stopword_removal(row):
    """Return the row's 'text' value after passing it through ``remove_stopwords``.

    Written for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by 'text'.

    Returns:
        Whatever ``remove_stopwords`` returns for ``row['text']``.
    """
    return remove_stopwords(row['text'])

In [None]:
# Run every row through stopword_removal and store the result back in 'text'.
text_without_stopwords = train_df.apply(stopword_removal, axis=1)
train_df['text'] = text_without_stopwords

In [None]:
# Preview the training data after stop-word removal.
train_df.head()

## Remove extra whitespace and punctuation, and apply lowercasing

In [None]:
# Normalise the tweet text in three steps:
#   1. lowercase everything;
#   2. replace each character that is neither a word character nor whitespace
#      with a space;
#   3. collapse every run of two or more whitespace characters into one space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn steps 2 and 3 into no-ops.
# Raw strings keep Python from interpreting `\w` / `\s` as string escapes.
train_df['text'] = (
    train_df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)

In [None]:
# Preview the training data after lowercasing and punctuation/whitespace cleanup.
train_df.head()

### Now the input tweets have been pre-processed, so let's extract features.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (using scikit-learn's built-in 'english' stop-word
# list) on the cleaned tweets; fit() returns the fitted vectorizer itself.
tweets = train_df['text']
vectorizer = TfidfVectorizer(stop_words='english').fit(tweets)

# Show the vocabulary learned from the tweets.
vectorizer.vocabulary_

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Keep the word list under its own name. Assigning it to `stopwords` would
# replace the imported nltk.corpus.stopwords module with a plain list, so any
# later `stopwords.words(...)` call would fail.
english_stopwords = stopwords.words('english')

print(english_stopwords)

# Count vectorizer over unigrams and bigrams; the token pattern accepts tokens
# of one or more word characters, and the NLTK English stop words are excluded.
count_vector = CountVectorizer(
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words=english_stopwords,
)

In [None]:
from sklearn.model_selection import train_test_split
# Features are the cleaned tweet text, labels are the `target` column.
X = train_df.text
y = train_df.target

# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# split - and therefore the accuracy reported below - reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Bag-of-words counts followed by logistic regression, fitted on the training
# split. `clf` is now the pipeline's final step; previously it was created and
# never used while a second, identical estimator was built inline.
# NOTE(review): the `count_vector` configured in an earlier cell (bigrams,
# NLTK stop words) is not used here - a default CountVectorizer is. Confirm
# which of the two is intended.
clf = LogisticRegression()
pipe = Pipeline([
    ('count_vector', CountVectorizer()),
    ('clf', clf),
])
pipe.fit(X_train, y_train)

In [None]:
from sklearn import metrics
# Predict labels for the held-out split with the fitted pipeline.
predicted = pipe.predict(X_test)

In [None]:
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# held-out labels go first and the predictions second.
print("accuracy :", metrics.accuracy_score(y_test, predicted))

# Storing Result

In [None]:
# The pipeline was fitted on cleaned text, so the test tweets must go through
# the same cleaning steps before prediction; feeding raw tweets would present
# the model with tokens in a form it never saw during training.
test_clean = test_df.copy()
# NOTE(review): missing test tweets are replaced by an empty string so that
# every submission row still gets a prediction - confirm this is acceptable.
test_clean['text'] = test_clean['text'].fillna('')
test_clean['text'] = test_clean.apply(preprocess_tweet, axis=1)
test_clean['text'] = test_clean.apply(stopword_removal, axis=1)
test_clean['text'] = (
    test_clean['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)

# Fill the sample submission's `target` column with the predictions and write
# it to the working directory.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv - verify.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission['target'] = pipe.predict(test_clean['text'])
submission.to_csv("submission.csv", index = False)

## Final Result

In [None]:
# Show the first 10 rows of the submission that was written to disk.
submission.head(10)