In [None]:
#import required stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

import re #for working with regular expression
import nltk #for natural language processing (nlp)
import spacy #also for nlp
import string #This is a module, Python also has built-in class str, these are different

In [None]:
# Read the two Kaggle input CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Preview the first rows and the column summary of each split.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
print('train dataframe : \n', train_df.head(5))
train_df.info()
# The label now ends with a newline, like the train label, so the column
# header of the preview starts on its own line.
print('test dataframe : \n', test_df.head(5))
test_df.info()

In [None]:
# Number of rows in the train split, then in the test split (one per line).
print(len(train_df.index), len(test_df.index), sep="\n")

In [None]:
# Keep an independent copy of the labelled training frame; the 'target' column
# is re-attached from it once the shared text cleaning is done.
train_df_copy = train_df.copy()

# Stack the training rows (without the label column) on top of the test rows
# so that every cleaning step is applied to both splits at once.
train_df = train_df.drop('target', axis=1)
frames = [train_df, test_df]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two splits overlap, which makes any label-based lookup
# on the combined frame ambiguous.
train_df = pd.concat(frames, ignore_index=True)

In [None]:
# Lower-case every entry of the "text" column and keep it as a new column.
lowercase_text = train_df['text'].str.lower()
train_df['lowered_text'] = lowercase_text
print(train_df['lowered_text'].iloc[:3])

In [None]:
# Removing punctuation
# Translation table that maps every character of string.punctuation to None,
# i.e. deletes it when passed to str.translate.
punctuation = string.punctuation
mapping = str.maketrans(dict.fromkeys(punctuation))


def remove_punctuation(in_str):
    """Return ``in_str`` with every ``string.punctuation`` character deleted."""
    stripped = in_str.translate(mapping)
    return stripped

# Show the first rows before and after punctuation is stripped in place.
print(train_df['lowered_text'].iloc[:10])
train_df['lowered_text'] = train_df["lowered_text"].apply(remove_punctuation)
print(train_df['lowered_text'].iloc[:10])

In [None]:
# Removing Stop words
from nltk.corpus import stopwords

# NLTK's English stop-word list, used by remove_stopwords below.
stopwords_eng = stopwords.words('english')

# Text before stop-word removal, for comparison with the output further down.
print(train_df["lowered_text"].iloc[:10])

def remove_stopwords(in_str):
    """Drop every whitespace-separated token of ``in_str`` found in ``stopwords_eng``.

    Each kept token is followed by one space, so a non-empty result ends with
    a trailing space; if no token survives the result is ``''``.
    """
    kept = [token + " " for token in in_str.split() if token not in stopwords_eng]
    return "".join(kept)

# Store the stop-word-free text in its own column and preview it.
train_df['lowered_text_stop_removed'] = train_df["lowered_text"].apply(remove_stopwords)
print(train_df["lowered_text_stop_removed"].iloc[:10])

In [None]:
# Removing most frequent 10 words
from collections import Counter

# Tally every whitespace-separated token of the stop-word-free text.
counter = Counter()
for entry in train_df["lowered_text_stop_removed"]:
    counter.update(entry.split())

# (word, count) pairs of the ten most frequent tokens.
most_cmn_list = counter.most_common(10)
print(type(most_cmn_list), most_cmn_list)

# Only the words are needed for filtering; the counts are dropped.
most_cmn_words_list = [token for token, _count in most_cmn_list]
print('Most common words : ', most_cmn_words_list)

def remove_frequent(in_str):
    """Drop every whitespace-separated token of ``in_str`` listed in ``most_cmn_words_list``.

    Each kept token is followed by one space; an input with no surviving
    tokens yields ``''``.
    """
    survivors = filter(lambda token: token not in most_cmn_words_list, in_str.split())
    return "".join(token + " " for token in survivors)

# Filter the most frequent words out of the stop-word-free text.
train_df["lowered_text_stop_removed_freq_removed"] = (
    train_df['lowered_text_stop_removed'].apply(remove_frequent)
)

In [None]:
# Removing 10 most rare words
# most_common() with no argument lists every token from most to least
# frequent, so its last ten entries are the ten lowest-ranked tokens.
most_rare_list = counter.most_common()[-10:]
most_rare_words = [token for token, _count in most_rare_list]
print('Most rare words : ', most_rare_words)

def remove_rare(in_text):
    """Drop every whitespace-separated token of ``in_text`` listed in ``most_rare_words``.

    Each kept token is followed by one space; an input with no surviving
    tokens yields ``""``.
    """
    pieces = []
    for token in in_text.split():
        if token in most_rare_words:
            continue
        pieces.append(token + " ")
    return "".join(pieces)

# Filter the rarest words out of the already frequency-filtered text.
train_df["lowered_stop_freq_rare_removed"] = (
    train_df["lowered_text_stop_removed_freq_removed"].apply(remove_rare)
)

In [None]:
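# Lemmatization without POS tags.
# The original note said to ignore this cell and skip to "Lemmatization with POS",
# but the "Lemmatized" column created here is read by the URL-removal step below,
# so this cell still has to be run.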

from nltk.stem import WordNetLemmatizer
# Single WordNetLemmatizer instance shared by every do_lemmatizing call.
lem=WordNetLemmatizer()

def do_lemmatizing(in_str):
    """Lemmatize each whitespace-separated token of ``in_str`` with ``lem``.

    Every lemmatized token is followed by one space, so a non-empty result
    ends with a trailing space; an empty input yields ``""``.
    """
    return "".join(lem.lemmatize(token) + " " for token in in_str.split())

# Lemmatize the filtered text into the "Lemmatized" column.
train_df["Lemmatized"] = train_df["lowered_stop_freq_rare_removed"].apply(do_lemmatizing)

In [None]:
# Removing URLs and HTML tags
def remove_urls(text):
    """Delete every run of non-space characters starting with ``http://``, ``https://`` or ``www.``."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
def remove_html(in_str):
    """Delete every ``<...>`` span, matching the shortest span on a single line."""
    return re.sub('<.*?>', r'', in_str)

# NOTE(review): punctuation was already stripped from "lowered_text" upstream,
# so characters such as ":", "/", ".", "<" and ">" no longer reach these two
# steps and the patterns have very little left to match. Consider running
# them on the raw text before punctuation removal.
train_df["urls_removed"] = train_df["Lemmatized"].apply(remove_urls)
train_df["html_removed"] = train_df["urls_removed"].apply(remove_html)


In [None]:
# Abbreviation table, one "ABBREVIATION=Expansion" entry per line.
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

# Parse the table: the text before the first "=" is the abbreviation, the text
# after it is the expansion. Empty lines (the first and last) are skipped.
_chat_word_fields = (
    entry.split("=") for entry in chat_words_str.split("\n") if entry != ""
)
chat_words_expanded_dict = {fields[0]: fields[1] for fields in _chat_word_fields}

# Set of all known abbreviations, used for membership checks.
chat_words_list = set(chat_words_expanded_dict)


def convert_chat_words(in_str):
    """Replace known chat abbreviations in ``in_str`` by their expansions.

    Tokens are split on whitespace and looked up case-insensitively (the
    token is upper-cased before the lookup). Unknown tokens are kept as they
    are. Every output token is followed by one space, so a non-empty result
    ends with a trailing space; an empty input yields ``""``.
    """
    pieces = []
    for token in in_str.split():
        key = token.upper()
        pieces.append(chat_words_expanded_dict[key] if key in chat_words_list else token)
    return "".join(piece + " " for piece in pieces)

# Expand chat abbreviations in the URL/HTML-stripped text.
train_df["chat_words_coverted"] = train_df["html_removed"].apply(convert_chat_words)

In [None]:
!pip install pyspellchecker

In [None]:
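# Spelling correction (currently disabled: all of the code below is commented out;
# the following cell copies "chat_words_coverted" into "spellings_corrected" unchanged)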

# from spellchecker import SpellChecker

# spell = SpellChecker()

# def correct_spellings(in_str):
#     new_str = ""
#     misspelled_words = spell.unknown(in_str.split())
#     for word in in_str.split():
#         if word in misspelled_words:
#             new_str = new_str + spell.correction(word) + " "
#         else:
#             new_str = new_str + word + " "
#     return new_str

# train_df["spellings_corrected"]=train_df["chat_words_coverted"].apply(lambda x: correct_spellings(x))

In [None]:
# Pass-through: the column is filled with the chat-word-expanded text as is,
# because the spell-correction code above is commented out.
train_df["spellings_corrected"]=train_df["chat_words_coverted"]

In [None]:
# Shapes of the raw test frame, the combined processed frame and the labelled
# training copy, one per line.
print(test_df.shape, train_df.shape, train_df_copy.shape, sep="\n")

In [None]:
# Split the combined frame back into its two parts. The boundary is the number
# of labelled training rows, read from the untouched copy instead of being
# typed in by hand (it was 7613 train / 3263 test rows when this was written),
# so the split stays correct if the input size changes.
n_train_rows = len(train_df_copy)

# .copy() makes each part an independent frame; adding the label column below
# then no longer writes into an iloc slice of the combined frame, which is
# what triggers pandas' SettingWithCopyWarning.
test_df = train_df.iloc[n_train_rows:, :].copy()
train_df = train_df.iloc[:n_train_rows, :].copy()

# .values attaches the labels by position, independent of the index labels.
train_df['target'] = train_df_copy['target'].values
train_df.head(5)

In [None]:
# Token-count vectorizer created with scikit-learn's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

In [None]:
# NOTE(review): the features are built from the raw "text" column; the cleaned
# "spellings_corrected" column produced earlier is not used here. Confirm that
# this is intended.
# The vectorizer is fitted on the training rows only and then reused, as is,
# to transform the test rows.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df["text"])
test_vectors = count_vectorizer.transform(raw_documents=test_df["text"])

In [None]:
# Text representation of the train matrix, then of the test matrix.
print(train_vectors, test_vectors, sep="\n")

In [None]:
# Submission template; its "target" column is overwritten with the model's
# predictions further down.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed random_state, fitted on the
# training count vectors and the re-attached 'target' labels.
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X=train_vectors, y=train_df["target"])

In [None]:
# Predict a label for every test row, store the labels in the template and
# write it out without the index column.
test_predictions = clf_lr.predict(X=test_vectors)
sample_submission["target"] = test_predictions
sample_submission.to_csv("submission.csv", index=False)