In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from unicodedata import normalize
import emoji
from textblob import TextBlob
from nltk.corpus import stopwords
import wordsegment as ws
ws.load()
import contractions

In [None]:
!pip install contractions

In [None]:
# Read the competition's train and test splits into separate frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Discard the 'id' column from both splits.
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

In [None]:
# Stack train and test so the same cleaning is applied to both.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the resulting rows, columns and index are the same.
df = pd.concat([train_df, test_df])
df

In [None]:
# Replace the duplicated per-split row labels with a fresh 0..n-1 index.
df = df.reset_index(drop=True)

In [None]:
# Display the combined frame.
df

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include='all')

In [None]:
# List the distinct keyword values.
df['keyword'].unique()

# Data Preprocessing on keyword feature

In [None]:
# Keywords carry URL-encoded spaces; swap each literal '%20' for a real space.
df['keyword'] = df['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
# Column dtypes and non-null counts after the keyword clean-up.
df.info()

In [None]:
# NLTK's English stop words, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

In [None]:
def extract_words(tweet):
    """Replace each hashtag in `tweet` with its segmented words.

    Every ``#word`` token is passed to ``ws.segment`` and replaced by the
    returned words joined with single spaces.

    Each match is substituted at its own position. Replacing by string value
    instead would let a short hashtag (``#fire``) rewrite the start of a
    longer one (``#firefighter``), leaving the longer tag unsegmented.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with every hashtag expanded in place.
    """
    def _segment(match):
        # match.group(0) is the whole hashtag, including the leading '#'
        return " ".join(ws.segment(match.group(0)))

    return re.sub(r"#\w+", _segment, tweet)

In [None]:
def remove_urls(tweet):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', tweet)

In [None]:
def remove_numbers(tweet):
    """Delete every digit character from `tweet`; all other characters stay."""
    digit_pattern = re.compile(r'[^\D\.]')
    return digit_pattern.sub('', tweet)

In [None]:
def remove_usermentions(tweet):
    """Delete every ``@`` followed by word characters (a user mention)."""
    mention_pattern = re.compile(r'@(\w+)')
    return mention_pattern.sub('', tweet)

In [None]:
def lowercase_words(df, index):
    """Lowercase the tweet at `index` and drop its stop words, in place.

    The text is split on whitespace, each token is lowercased, and tokens
    found in the module-level ``stop_words`` set are discarded. Lowercasing
    before the membership test matters: comparing the original casing lets
    capitalised stop words such as "The" or "AND" slip through.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column; the cell at (`index`, 'text') is overwritten.
    index : hashable
        Row label of the tweet to process.

    Returns
    -------
    str
        The cleaned text that was written back to the frame.
    """
    tokens = (token.lower() for token in df.at[index, 'text'].split())
    cleaned = ' '.join(token for token in tokens if token not in stop_words)
    df.at[index, 'text'] = cleaned
    return cleaned

In [None]:
def remove_punctuations(tweet):
    """Delete every character that is neither a word character nor whitespace."""
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', tweet)

In [None]:
def correct_spellings(tweet):
    """Run TextBlob's spelling correction on `tweet` and return a plain str."""
    corrected_blob = TextBlob(tweet).correct()
    return str(corrected_blob)

In [None]:
# Clean the 'text' column row by row. Each tweet is processed in a local
# variable and written back to the frame only where a helper needs it there.
for index in df.index:
    text = df.at[index, 'text']

    # convert emojis into text -- this has to run before the ASCII folding
    # below, which would otherwise delete every emoji before it is translated
    text = emoji.demojize(text, delimiters=("", ""))

    # fold characters to their ASCII equivalents and drop the rest
    text = normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # remove urls -- done before hashtag segmentation so a '#fragment' inside
    # a link cannot be expanded into words that break the link apart
    text = remove_urls(text)

    # remove user mentions
    text = remove_usermentions(text)

    # replace hashtags with their segmented words
    text = extract_words(text)

    # expand contractions
    text = contractions.fix(text)

    # remove numbers
    text = remove_numbers(text)

    # remove punctuations
    text = remove_punctuations(text)

    # correct spelling mistakes
    text = correct_spellings(text)

    # lowercase_words reads from and writes to the frame, so store the
    # intermediate text before calling it
    df.at[index, 'text'] = text
    text = lowercase_words(df, index)

    # removing unwanted white spaces
    df.at[index, 'text'] = ' '.join(text.split())

In [None]:
# Lowercase every tweet in one vectorized pass instead of a Python-level loop;
# missing values are left as they are rather than raising.
df['text'] = df['text'].str.lower()

In [None]:
# Persist the cleaned frame next to the notebook.
df.to_csv(path_or_buf='nlp_tweets.csv')

In [None]:
# Display the cleaned frame.
df