## Step 1. EDA

In [337]:
import pandas as pd

In [338]:
# Only the first 10 rows of the CSV are read (nrows=10); raise or drop
# `nrows` to process more of the file.
df = pd.read_csv('data/train.csv',nrows=10)

In [339]:
# Remove the column-width limit so long text cells are displayed in full
# instead of being truncated.
pd.set_option('display.max_colwidth', None)

In [340]:
#df

In [341]:
import spacy

In [342]:
nlp = spacy.load('en_core_web_sm') # The model needs to be downloaded first: python -m spacy download en_core_web_sm

In [343]:
def lower_replace(series):
    """Lower-case a Series of strings and reduce it to letters and single spaces.

    Steps, in order:
      1. lower-case every string;
      2. delete every character that is neither a word character nor
         whitespace (punctuation and symbols; ``_`` is a word character and
         is kept), and delete every digit;
      3. collapse each run of whitespace (spaces, ``\\n``, ``\\t``, ``\\r``, ...)
         into one space.

    Deleted characters are removed without a replacement, so words separated
    only by punctuation end up joined ("no-one" -> "noone"). Leading and
    trailing whitespace is collapsed to a single space, not stripped.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; it is processed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The transformed strings, in the same order as the input.
    """
    return (
        series.str.lower()
        # Punctuation and digits are both deleted outright, so a single pass
        # with an alternation gives the same result as two separate passes.
        .str.replace(r'[^\w\s]|\d', '', regex=True)
        # `\s+` already matches newlines, tabs and carriage returns, so no
        # separate literal replacements for them are needed beforehand.
        .str.replace(r'\s+', ' ', regex=True)
    )
    
def tokenize_lemmatize(text, remove_stop=True):
    """Return the space-joined lemmas of ``text``.

    ``text`` is processed with the notebook-level ``nlp`` pipeline. A token
    contributes its ``lemma_`` only when its text is longer than one character
    or is exactly ``'i'``. When ``remove_stop`` is true, tokens whose
    ``is_stop`` flag is set are skipped as well.
    """
    def keep(token):
        # Stop-word filtering is only consulted when it was requested.
        if remove_stop and token.is_stop:
            return False
        return token.text == 'i' or len(token.text) > 1

    return ' '.join(token.lemma_ for token in nlp(text) if keep(token))

In [344]:
def normalize_text(series, remove_stop_words=True):
    """Clean a Series of raw texts and lemmatize every entry.

    The whole Series first goes through ``lower_replace``; each resulting
    string is then passed to ``tokenize_lemmatize`` with ``remove_stop`` set
    to ``remove_stop_words``.
    """
    cleaned = lower_replace(series)
    return cleaned.apply(
        lambda text: tokenize_lemmatize(text, remove_stop=remove_stop_words)
    )

In [345]:
# Drop the results of a previous run first: DataFrame.insert raises ValueError
# when the column already exists, which made this cell fail when re-executed.
df = df.drop(columns=['text_clean_nostop', 'text_clean_withstop'], errors='ignore')
# Insert the two normalized variants at positions 2 and 3: one with stop
# words removed, one with stop words kept.
df.insert(loc=2,column='text_clean_nostop',value=normalize_text(df.comment_text))
df.insert(loc=3,column='text_clean_withstop',value=normalize_text(df.comment_text, remove_stop_words=False))

In [346]:
# Preview the first five rows, including the two cleaned-text columns.
df.head(5)

Unnamed: 0,id,comment_text,text_clean_nostop,text_clean_withstop,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",explanation edit username hardcore metallica fan revert not vandalism closure gas vote new york doll fac not remove template talk page retire,explanation why the edit make under my username hardcore metallica fan be revert they be not vandalism just closure on some gas after I vote at new york doll fac and please do not remove the template from the talk page since I retire now,0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",daww match background colour seemingly stick thank talk january utc,daww he match this background colour I seemingly stick with thank talk january utc,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",hey man try edit war guy constantly remove relevant information talk edit instead talk page care formatting actual info,hey man I really not try to edit war its just that this guy be constantly remove relevant information and talk to I through edit instead of my talk page he seem to care more about the formatting than the actual info,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,not real suggestion improvement wonder section statistic later subsection type accident think reference need tidy exact format ie date format etc later preference formatting style reference want let know appear backlog article review guess delay reviewer turn list relevant form eg wikipediagood_article_nominationstransport,more I can not make any real suggestion on improvement I wonder if the section statistic should be later on or subsection of type of accident I think the reference may need tidy so that they be all in the exact same format ie date format etc I can do that later on if noone else do first if you have any preference for formatting style on reference or want to do it yourself please let I know there appear to be backlog on article for review so I guess there may be delay until reviewer turn up its list in the relevant form eg wikipediagood_article_nominationstransport,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",sir hero chance remember page,you sir be my hero any chance you remember what page that on,0,0,0,0,0,0


In [347]:
# Persist both cleaned-text columns as pickle files (paths are relative to
# the current working directory).
pd.to_pickle(df['text_clean_nostop'], 'text_clean_nostop.pkl')
pd.to_pickle(df['text_clean_withstop'], 'text_clean_withstop.pkl')

## Step 2. Vectorization

In [348]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [349]:
def create_tfidf_df(series, ngram_range=(1,2)):
    """Fit a ``TfidfVectorizer`` on ``series`` and return the weights as a DataFrame.

    Parameters
    ----------
    series : pandas.Series or iterable of str
        The documents to vectorize, one string per document.
    ngram_range : tuple of (int, int), default (1, 2)
        Passed unchanged to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per feature name reported by the
        fitted vectorizer. When ``series`` is a pandas Series its index is
        reused, so rows stay aligned with the frame the texts came from;
        otherwise a default ``RangeIndex`` is used.

    Notes
    -----
    The sparse result is converted with ``toarray()``, so memory grows with
    ``n_documents * n_features``; this is only suitable for small inputs.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    weights = vectorizer.fit_transform(series)
    # Keep the caller's row labels instead of silently renumbering from 0.
    row_index = series.index if isinstance(series, pd.Series) else None
    return pd.DataFrame(
        weights.toarray(),
        index=row_index,
        columns=vectorizer.get_feature_names_out(),
    )

In [350]:
# Build unigram + bigram TF-IDF tables for both cleaned variants of the text.
tfidf_nostop_df = create_tfidf_df(df['text_clean_nostop'], ngram_range=(1, 2))

tfidf_withstop_df = create_tfidf_df(df['text_clean_withstop'], ngram_range=(1, 2))

In [351]:
# Display the TF-IDF table built from the text with stop words removed.
tfidf_nostop_df


Unnamed: 0,accident,accident think,actual,actual info,alignment,alignment subject,animal,animal breeding,appear,appear backlog,...,wikipediagood_article_nominationstransport,wonder,wonder section,word,word nonsense,work,write,write articlewow,york,york doll
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157615,0.157615
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.167389,0.167389,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.107557,0.107557,0.0,0.0,0.0,0.0,0.0,0.0,0.107557,0.107557,...,0.107557,0.107557,0.107557,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.13279,0.13279,0.0,0.0,...,0.0,0.0,0.0,0.13279,0.13279,0.0,0.13279,0.13279,0.0,0.0
9,0.0,0.0,0.0,0.0,0.377964,0.377964,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [352]:
# Display the TF-IDF table built from the text with stop words kept.
tfidf_withstop_df

Unnamed: 0,about,about the,accident,accident think,actual,actual info,after,after vote,again,again or,...,you have,you no,you piss,you remember,you sir,you will,your,your vandalism,yourself,yourself please
0,0.0,0.0,0.0,0.0,0.0,0.0,0.115263,0.115263,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.118104,0.118104,0.0,0.0,0.118104,0.118104,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.067978,0.067978,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067978,0.067978
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.218371,0.218371,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.279501,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17393,0.17393,...,0.0,0.0,0.0,0.0,0.0,0.17393,0.17393,0.17393,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.082463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
