# Import and setup pandas dataframe

In [1]:
# Pandas is used for data manipulation
import numpy as np
import pandas as pd
import contractions
import fasttext
import string
fasttext.FastText.eprint = lambda x: None

# NLTK tokenization, stopword removal and lemmatization
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
# Read in data and display first 10 rows
pd.set_option("display.max_colwidth", 10000)
df = pd.read_csv('src/2018_pinkbike_comments.csv')
df.head(10)

Unnamed: 0,id,article_id,comment_author_id,comment_html_id,comment_publishing_date,comment_upvotes,comment_downvotes,comment_content
0,1,1,1,1898605,1517781780,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source."
1,2,1,2,1896303,1517802540,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!
2,3,1,3,1897961,1517441160,1,0,难得在国际版看到中国内容
3,4,1,2,1896280,1517457120,1,0,我希望能写更多有关中国单车的内容
4,5,1,4,1896452,1517596320,1,0,来多几段，这么多的活动没人报道
5,6,3,7,1896246,1517424960,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.
6,7,3,8,1896306,1517441280,18,1,I had literally never considered that they might be brothers. This is embarrassing.
7,8,3,9,1896267,1517444640,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that..."
8,9,3,10,1896260,1517459940,1,0,@bonkywonky: Makes two of dude!
9,10,3,11,1896288,1517460540,3,0,"I have to say, to my shame, that before this video I never even realized Matt had a sibling at all, especially not a twin... :O"


In [3]:
df.drop("id", axis = 1, inplace = True)

In [4]:
print (list(df))

['article_id', 'comment_author_id', 'comment_html_id', 'comment_publishing_date', 'comment_upvotes', 'comment_downvotes', 'comment_content']


In [5]:
for col in df.columns:
    print(col, df[col].isna().sum())

article_id 0
comment_author_id 0
comment_html_id 0
comment_publishing_date 0
comment_upvotes 0
comment_downvotes 0
comment_content 26


In [6]:
df.dropna(axis = 0, inplace = True)

In [7]:
cmnts = df.loc[:, ["comment_upvotes", "comment_downvotes", "comment_content"]]

# Text preprocessing

## Expanding shortforms

In [8]:
cmnts["no_contract"] = cmnts['comment_content'].apply(lambda x: [contractions.fix(word) for word in x.split()])
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]"
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]"
2,1,0,难得在国际版看到中国内容,[难得在国际版看到中国内容]
3,1,0,我希望能写更多有关中国单车的内容,[我希望能写更多有关中国单车的内容]
4,1,0,来多几段，这么多的活动没人报道,[来多几段，这么多的活动没人报道]


In [9]:
cmnts["comment_content_str"] = [' '.join(map(str, l)) for l in cmnts['no_contract']]
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source."
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!
2,1,0,难得在国际版看到中国内容,[难得在国际版看到中国内容],难得在国际版看到中国内容
3,1,0,我希望能写更多有关中国单车的内容,[我希望能写更多有关中国单车的内容],我希望能写更多有关中国单车的内容
4,1,0,来多几段，这么多的活动没人报道,[来多几段，这么多的活动没人报道],来多几段，这么多的活动没人报道


## Detect language and delete non-english comments

In [10]:
pretrained_model = "src/lid.176.bin"
model = fasttext.load_model(pretrained_model)
langs = []

for sent in cmnts["comment_content_str"]:
    lang = model.predict(sent)[0]
    langs.append(str(lang)[11:13])
cmnts["langs"] = langs
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en
2,1,0,难得在国际版看到中国内容,[难得在国际版看到中国内容],难得在国际版看到中国内容,zh
3,1,0,我希望能写更多有关中国单车的内容,[我希望能写更多有关中国单车的内容],我希望能写更多有关中国单车的内容,zh
4,1,0,来多几段，这么多的活动没人报道,[来多几段，这么多的活动没人报道],来多几段，这么多的活动没人报道,zh


In [11]:
new_cmnts = cmnts[~cmnts["langs"].str.contains("en", na = False)]
new_cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs
2,1,0,难得在国际版看到中国内容,[难得在国际版看到中国内容],难得在国际版看到中国内容,zh
3,1,0,我希望能写更多有关中国单车的内容,[我希望能写更多有关中国单车的内容],我希望能写更多有关中国单车的内容,zh
4,1,0,来多几段，这么多的活动没人报道,[来多几段，这么多的活动没人报道],来多几段，这么多的活动没人报道,zh
18,3,0,Oi Oi tobi !,"[Oi, Oi, tobi, !]",Oi Oi tobi !,fr
20,2,0,soooo gut,"[soooo, gut]",soooo gut,de


In [12]:
# Get indexes of foreign languages
indexNames = new_cmnts.index 

# Delete these row indexes from dataFrame
cmnts.drop(indexNames , inplace = True)
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en


## All characters lowercase

In [13]:
cmnts['lower'] = cmnts['comment_content_str'].str.lower()
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source."
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that..."


## Remove URLs

In [14]:
cmnts['no_url'] = cmnts['lower'].replace(r'http\S+', '', regex = True).replace(r'www\S+', '', regex = True)
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source."
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that..."


## Remove numbers

In [15]:
cmnts['no_numb'] = cmnts['no_url'].str.replace(r'\d+','', regex = True)
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source."
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that..."


## Removing punctuation

In [16]:
def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    return s

# remove punctuation
cmnts['no_punc'] = cmnts['no_numb'].apply(remove_punctuation)
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb,no_punc
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.",hey man note month is a forest fire prevention season in kunming and some forest trails may not be allowed to enterbut some forest trails are still allowed to enter you can not use a fire source
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop yep i know about the fire prevention thankfully they do not close off bao zhu let us ride sometime
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones video frames of mind video was one of the best from last year
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers this is embarrassing
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...",i actually thought for a while it was the same person jono being the nickname or something like that


## Tokenization

In [17]:
cmnts['tokenized'] = cmnts['no_punc'].apply(word_tokenize)
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb,no_punc,tokenized
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.",hey man note month is a forest fire prevention season in kunming and some forest trails may not be allowed to enterbut some forest trails are still allowed to enter you can not use a fire source,"[hey, man, note, month, is, a, forest, fire, prevention, season, in, kunming, and, some, forest, trails, may, not, be, allowed, to, enterbut, some, forest, trails, are, still, allowed, to, enter, you, can, not, use, a, fire, source]"
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop yep i know about the fire prevention thankfully they do not close off bao zhu let us ride sometime,"[hey, hoop, yep, i, know, about, the, fire, prevention, thankfully, they, do, not, close, off, bao, zhu, let, us, ride, sometime]"
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones video frames of mind video was one of the best from last year,"[matt, jones, video, frames, of, mind, video, was, one, of, the, best, from, last, year]"
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers this is embarrassing,"[i, had, literally, never, considered, that, they, might, be, brothers, this, is, embarrassing]"
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...",i actually thought for a while it was the same person jono being the nickname or something like that,"[i, actually, thought, for, a, while, it, was, the, same, person, jono, being, the, nickname, or, something, like, that]"


## Remove stopwords

In [18]:
stop_words = set(stopwords.words('english'))
cmnts['stopwords_removed'] = cmnts['tokenized'].apply(lambda x: [word for word in x if word not in stop_words])
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb,no_punc,tokenized,stopwords_removed
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.",hey man note month is a forest fire prevention season in kunming and some forest trails may not be allowed to enterbut some forest trails are still allowed to enter you can not use a fire source,"[hey, man, note, month, is, a, forest, fire, prevention, season, in, kunming, and, some, forest, trails, may, not, be, allowed, to, enterbut, some, forest, trails, are, still, allowed, to, enter, you, can, not, use, a, fire, source]","[hey, man, note, month, forest, fire, prevention, season, kunming, forest, trails, may, allowed, enterbut, forest, trails, still, allowed, enter, use, fire, source]"
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop yep i know about the fire prevention thankfully they do not close off bao zhu let us ride sometime,"[hey, hoop, yep, i, know, about, the, fire, prevention, thankfully, they, do, not, close, off, bao, zhu, let, us, ride, sometime]","[hey, hoop, yep, know, fire, prevention, thankfully, close, bao, zhu, let, us, ride, sometime]"
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones video frames of mind video was one of the best from last year,"[matt, jones, video, frames, of, mind, video, was, one, of, the, best, from, last, year]","[matt, jones, video, frames, mind, video, one, best, last, year]"
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers this is embarrassing,"[i, had, literally, never, considered, that, they, might, be, brothers, this, is, embarrassing]","[literally, never, considered, might, brothers, embarrassing]"
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...",i actually thought for a while it was the same person jono being the nickname or something like that,"[i, actually, thought, for, a, while, it, was, the, same, person, jono, being, the, nickname, or, something, like, that]","[actually, thought, person, jono, nickname, something, like]"


## Lemmatization

In [None]:
cmnts['pos_tags'] = cmnts['stopwords_removed'].apply(nltk.tag.pos_tag)
cmnts.head()

In [20]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

cmnts['wordnet_pos'] = cmnts['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb,no_punc,tokenized,stopwords_removed,pos_tags,wordnet_pos
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.",hey man note month is a forest fire prevention season in kunming and some forest trails may not be allowed to enterbut some forest trails are still allowed to enter you can not use a fire source,"[hey, man, note, month, is, a, forest, fire, prevention, season, in, kunming, and, some, forest, trails, may, not, be, allowed, to, enterbut, some, forest, trails, are, still, allowed, to, enter, you, can, not, use, a, fire, source]","[hey, man, note, month, forest, fire, prevention, season, kunming, forest, trails, may, allowed, enterbut, forest, trails, still, allowed, enter, use, fire, source]","[(hey, NN), (man, NN), (note, VBP), (month, NN), (forest, JJS), (fire, NN), (prevention, NN), (season, NN), (kunming, VBG), (forest, JJS), (trails, NNS), (may, MD), (allowed, VB), (enterbut, NN), (forest, JJS), (trails, NNS), (still, RB), (allowed, VBN), (enter, NN), (use, NN), (fire, NN), (source, NN)]","[(hey, n), (man, n), (note, v), (month, n), (forest, a), (fire, n), (prevention, n), (season, n), (kunming, v), (forest, a), (trails, n), (may, n), (allowed, v), (enterbut, n), (forest, a), (trails, n), (still, r), (allowed, v), (enter, n), (use, n), (fire, n), (source, n)]"
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop yep i know about the fire prevention thankfully they do not close off bao zhu let us ride sometime,"[hey, hoop, yep, i, know, about, the, fire, prevention, thankfully, they, do, not, close, off, bao, zhu, let, us, ride, sometime]","[hey, hoop, yep, know, fire, prevention, thankfully, close, bao, zhu, let, us, ride, sometime]","[(hey, NN), (hoop, VBD), (yep, RB), (know, JJ), (fire, NN), (prevention, NN), (thankfully, RB), (close, JJ), (bao, NNS), (zhu, VBP), (let, VB), (us, PRP), (ride, VB), (sometime, RB)]","[(hey, n), (hoop, v), (yep, r), (know, a), (fire, n), (prevention, n), (thankfully, r), (close, a), (bao, n), (zhu, v), (let, v), (us, n), (ride, v), (sometime, r)]"
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones video frames of mind video was one of the best from last year,"[matt, jones, video, frames, of, mind, video, was, one, of, the, best, from, last, year]","[matt, jones, video, frames, mind, video, one, best, last, year]","[(matt, NN), (jones, NNS), (video, VBP), (frames, NNS), (mind, VBP), (video, JJ), (one, CD), (best, JJS), (last, JJ), (year, NN)]","[(matt, n), (jones, n), (video, v), (frames, n), (mind, v), (video, a), (one, n), (best, a), (last, a), (year, n)]"
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers this is embarrassing,"[i, had, literally, never, considered, that, they, might, be, brothers, this, is, embarrassing]","[literally, never, considered, might, brothers, embarrassing]","[(literally, RB), (never, RB), (considered, VBN), (might, MD), (brothers, NNS), (embarrassing, VBG)]","[(literally, r), (never, r), (considered, v), (might, n), (brothers, n), (embarrassing, v)]"
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...",i actually thought for a while it was the same person jono being the nickname or something like that,"[i, actually, thought, for, a, while, it, was, the, same, person, jono, being, the, nickname, or, something, like, that]","[actually, thought, person, jono, nickname, something, like]","[(actually, RB), (thought, VBN), (person, NN), (jono, NN), (nickname, NN), (something, NN), (like, IN)]","[(actually, r), (thought, v), (person, n), (jono, n), (nickname, n), (something, n), (like, n)]"


In [21]:
wnl = WordNetLemmatizer()
cmnts['lemmatized'] = cmnts['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
cmnts.head()

Unnamed: 0,comment_upvotes,comment_downvotes,comment_content,no_contract,comment_content_str,langs,lower,no_url,no_numb,no_punc,tokenized,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can't use a fire source.","[Hey, man., Note:, 1-6, month, is, a, forest, fire, prevention, season, in, Kunming,, and, some, forest, trails, may, not, be, allowed, to, enter.But, some, forest, trails, are, still, allowed, to, enter., You, can not, use, a, fire, source.]","Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You can not use a fire source.",en,"hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: 1-6 month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.","hey man. note: - month is a forest fire prevention season in kunming, and some forest trails may not be allowed to enter.but some forest trails are still allowed to enter. you can not use a fire source.",hey man note month is a forest fire prevention season in kunming and some forest trails may not be allowed to enterbut some forest trails are still allowed to enter you can not use a fire source,"[hey, man, note, month, is, a, forest, fire, prevention, season, in, kunming, and, some, forest, trails, may, not, be, allowed, to, enterbut, some, forest, trails, are, still, allowed, to, enter, you, can, not, use, a, fire, source]","[hey, man, note, month, forest, fire, prevention, season, kunming, forest, trails, may, allowed, enterbut, forest, trails, still, allowed, enter, use, fire, source]","[(hey, NN), (man, NN), (note, VBP), (month, NN), (forest, JJS), (fire, NN), (prevention, NN), (season, NN), (kunming, VBG), (forest, JJS), (trails, NNS), (may, MD), (allowed, VB), (enterbut, NN), (forest, JJS), (trails, NNS), (still, RB), (allowed, VBN), (enter, NN), (use, NN), (fire, NN), (source, NN)]","[(hey, n), (man, n), (note, v), (month, n), (forest, a), (fire, n), (prevention, n), (season, n), (kunming, v), (forest, a), (trails, n), (may, n), (allowed, v), (enterbut, n), (forest, a), (trails, n), (still, r), (allowed, v), (enter, n), (use, n), (fire, n), (source, n)]","[hey, man, note, month, forest, fire, prevention, season, kunming, forest, trail, may, allow, enterbut, forest, trail, still, allow, enter, use, fire, source]"
1,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they don't close off Bao Zhu. Let's ride sometime!,"[Hey, hoop., Yep, I, know, about, the, fire, prevention., Thankfully, they, do not, close, off, Bao, Zhu., let us, ride, sometime!]",Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,en,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop. yep i know about the fire prevention. thankfully they do not close off bao zhu. let us ride sometime!,hey hoop yep i know about the fire prevention thankfully they do not close off bao zhu let us ride sometime,"[hey, hoop, yep, i, know, about, the, fire, prevention, thankfully, they, do, not, close, off, bao, zhu, let, us, ride, sometime]","[hey, hoop, yep, know, fire, prevention, thankfully, close, bao, zhu, let, us, ride, sometime]","[(hey, NN), (hoop, VBD), (yep, RB), (know, JJ), (fire, NN), (prevention, NN), (thankfully, RB), (close, JJ), (bao, NNS), (zhu, VBP), (let, VB), (us, PRP), (ride, VB), (sometime, RB)]","[(hey, n), (hoop, v), (yep, r), (know, a), (fire, n), (prevention, n), (thankfully, r), (close, a), (bao, n), (zhu, v), (let, v), (us, n), (ride, v), (sometime, r)]","[hey, hoop, yep, know, fire, prevention, thankfully, close, bao, zhu, let, u, ride, sometime]"
5,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,"[Matt, Jones', video, Frames, of, Mind, video, was, one, of, the, best, from, last, year.]",Matt Jones' video Frames of Mind video was one of the best from last year.,en,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones' video frames of mind video was one of the best from last year.,matt jones video frames of mind video was one of the best from last year,"[matt, jones, video, frames, of, mind, video, was, one, of, the, best, from, last, year]","[matt, jones, video, frames, mind, video, one, best, last, year]","[(matt, NN), (jones, NNS), (video, VBP), (frames, NNS), (mind, VBP), (video, JJ), (one, CD), (best, JJS), (last, JJ), (year, NN)]","[(matt, n), (jones, n), (video, v), (frames, n), (mind, v), (video, a), (one, n), (best, a), (last, a), (year, n)]","[matt, jones, video, frame, mind, video, one, best, last, year]"
6,18,1,I had literally never considered that they might be brothers. This is embarrassing.,"[I, had, literally, never, considered, that, they, might, be, brothers., This, is, embarrassing.]",I had literally never considered that they might be brothers. This is embarrassing.,en,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers. this is embarrassing.,i had literally never considered that they might be brothers this is embarrassing,"[i, had, literally, never, considered, that, they, might, be, brothers, this, is, embarrassing]","[literally, never, considered, might, brothers, embarrassing]","[(literally, RB), (never, RB), (considered, VBN), (might, MD), (brothers, NNS), (embarrassing, VBG)]","[(literally, r), (never, r), (considered, v), (might, n), (brothers, n), (embarrassing, v)]","[literally, never, consider, might, brother, embarrass]"
7,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...","[I, actually, thought, for, a, while, it, was, the, same, person,, Jono, being, the, nickname, or, something, like, that...]","I actually thought for a while it was the same person, Jono being the nickname or something like that...",en,"i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...","i actually thought for a while it was the same person, jono being the nickname or something like that...",i actually thought for a while it was the same person jono being the nickname or something like that,"[i, actually, thought, for, a, while, it, was, the, same, person, jono, being, the, nickname, or, something, like, that]","[actually, thought, person, jono, nickname, something, like]","[(actually, RB), (thought, VBN), (person, NN), (jono, NN), (nickname, NN), (something, NN), (like, IN)]","[(actually, r), (thought, v), (person, n), (jono, n), (nickname, n), (something, n), (like, n)]","[actually, think, person, jono, nickname, something, like]"


## Save as clean CSV

In [22]:
cmnts_clean = cmnts.loc[:, ["comment_upvotes", "comment_downvotes", "lemmatized"]]
cmnts_clean.head()

Unnamed: 0,comment_upvotes,comment_downvotes,lemmatized
0,1,0,"[hey, man, note, month, forest, fire, prevention, season, kunming, forest, trail, may, allow, enterbut, forest, trail, still, allow, enter, use, fire, source]"
1,1,0,"[hey, hoop, yep, know, fire, prevention, thankfully, close, bao, zhu, let, u, ride, sometime]"
5,30,0,"[matt, jones, video, frame, mind, video, one, best, last, year]"
6,18,1,"[literally, never, consider, might, brother, embarrass]"
7,15,0,"[actually, think, person, jono, nickname, something, like]"


In [23]:
cmnts_clean.to_csv('src/YYYY_clean_pinkbike_comments.csv')