## Importing datasets

In [3]:
! gdown --id 1jwpMTJ0vfHLjmvoktxtCYJATlNpfRfo4


Downloading...
From: https://drive.google.com/uc?id=1jwpMTJ0vfHLjmvoktxtCYJATlNpfRfo4
To: /content/test-balanced.csv
  0% 0.00/923k [00:00<?, ?B/s]100% 923k/923k [00:00<00:00, 60.0MB/s]


In [4]:
! gdown --id 1JqRiJyv_BBz6BnpUkg4Xzb96-l08FSfS

Downloading...
From: https://drive.google.com/uc?id=1JqRiJyv_BBz6BnpUkg4Xzb96-l08FSfS
To: /content/test-unbalanced.csv
100% 17.4M/17.4M [00:00<00:00, 106MB/s]


In [5]:
! gdown --id 1U01uPbghi2_OlqNoDyG_fnnFDf0vjIsL

Downloading...
From: https://drive.google.com/uc?id=1U01uPbghi2_OlqNoDyG_fnnFDf0vjIsL
To: /content/train-balanced-sarcasm.csv
100% 255M/255M [00:01<00:00, 147MB/s]


In [6]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

## Loading data

In [7]:
df_train_balanced = pd.read_csv("/content/train-balanced-sarcasm.csv")
df_test_balanced = pd.read_csv("/content/test-balanced.csv")
df_test_unbalanced = pd.read_csv("/content/test-unbalanced.csv")

### **Sample of training data (balanced)**

In [8]:
df_train_balanced.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


### **Sample of testing data (balanced)**

In [9]:
df_test_balanced.head()

Unnamed: 0,7x7dx,c07nkao c07nk63,1 0
0,7xtxk,c07pji4 c07ppds,1 0
1,7zh5l,c07twag c07twp2,1 0
2,80nmo,c07y1rj c07xhyn,0 1
3,8139p,c07yhlm c07yoiw,1 0
4,bnmod,c0nnujf c0nofrs,1 0


### **Sample of testing data (unbalanced)**

In [10]:
df_test_unbalanced.head()

Unnamed: 0,7u1ht,c07em3g,1
0,7u92p,c07f8sq c07flil c07fgh5 c07f9ck c07fn0s c07fl0...,0 0 0 0 0 0 0 0 0 0 0
1,7vvpw,c07khcb c07jx0k c07jzxi c07k0t3 c07jx2j,0 0 0 0 0
2,7vv27 c07ju7y,c07jub3,1
3,7wco4,c07l718 c07lfcd c07l7zu,0 0 0
4,7xgdr,c07o9qu,1


## Data Preprocessing

### **Tokenisation and removing stopwords**

In [11]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
porter_stemmer = PorterStemmer()
word_tokenizer = TreebankWordTokenizer()
word_tokenizer2 = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()


def create_stopwords_dict():
  stopwords_dict = {}
  for word in set(stopwords.words('english')):
    stopwords_dict[word] = True
  return stopwords_dict

stop_words_dict = create_stopwords_dict()
print(stop_words_dict)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
{'a': True, 'mustn': True, 'he': True, 'all': True, 'ourselves': True, 'whom': True, 'over': True, 'during': True, 'most': True, 'itself': True, 'is': True, 'the': True, 'hers': True, 'any': True, 'ours': True, 'her': True, 'few': True, 'just': True, 'no': True, 'its': True, 'they': True, 're': True, 'those': True, 'for': True, 'are': True, 'mightn': True, 'doesn': True, 'couldn': True, 'same': True, 'm': True, 'haven': True, 'because': True, 'when': True, 'that': True, 'other': True, 'too': True, 'his': True, 'under': True, "you'll": True, 'their': True, 'll': True, "haven't": True, 'yours': True, "shan't": True, 'we': True, 'shan': True, 'shouldn': True, 'my': True, 'wouldn': True, 'our': True, 'd': True, 'who': True, 'don': True, 'himself': True, 'you': True, 'further': True, 'up': True, 'each': True, "wasn't": True, 'should': True, 'there': True, 'have': True, 'then': True

In [12]:
def remove_stopwords_and_tokenize(text):
  arr = word_tokenizer.tokenize(text)
  arr = [word for word in arr if word not in stop_words_dict]
  
  return arr

df_train_balanced.dropna(subset=['comment'], inplace=True)
df_train_balanced['comment'] = df_train_balanced['comment'].apply(lambda x: remove_stopwords_and_tokenize(x))
df_train_balanced['parent_comment'] = df_train_balanced['parent_comment'].apply(lambda x: remove_stopwords_and_tokenize(x))

In [13]:
df_train_balanced.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,"[NC, NH, .]",Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"[Yeah, ,, I, get, argument., At, point, ,, I, ..."
1,0,"[You, know, west, teams, play, west, teams, ea...",Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,"[The, blazers, Mavericks, (, The, wests, 5, 6,..."
2,0,"[They, underdogs, earlier, today, ,, since, Gr...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,"[They, 're, favored, win, .]"
3,0,"[This, meme, n't, funny, none, ``, new, york, ...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,"[deadass, n't, kill, buzz]"
4,0,"[I, could, use, one, tools, .]",cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,"[Yep, confirm, I, saw, tool, use, that., It, m..."


In [None]:
## Store number of sarcastic, non sarcastic, and total comments made by an author in the past
## First filter and split the dataset into sarcastic comments and non-sarcastic comments

df_train_balanced_sarcastic = df_train_balanced[df_train_balanced['label'] == 1]
df_train_balanced_unsarcastic = df_train_balanced[df_train_balanced['label'] == 0]

## Ensure dataset is balanced
print("Sarcastic size ", df_train_balanced_sarcastic.size)
print("Unsarcastic size ", df_train_balanced_unsarcastic.size)

In [14]:
df_train_balanced['num_comment_words'] = df_train_balanced['comment'].apply(lambda x: len(x))
df_train_balanced['num_parent_comment_words'] = df_train_balanced['parent_comment'].apply(lambda x: len(x))
df_train_balanced.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,num_comment_words,num_parent_comment_words
0,0,"[NC, NH, .]",Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"[Yeah, ,, I, get, argument., At, point, ,, I, ...",3,15
1,0,"[You, know, west, teams, play, west, teams, ea...",Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,"[The, blazers, Mavericks, (, The, wests, 5, 6,...",11,21
2,0,"[They, underdogs, earlier, today, ,, since, Gr...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,"[They, 're, favored, win, .]",16,5
3,0,"[This, meme, n't, funny, none, ``, new, york, ...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,"[deadass, n't, kill, buzz]",12,4
4,0,"[I, could, use, one, tools, .]",cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,"[Yep, confirm, I, saw, tool, use, that., It, m...",6,11


In [25]:
def feature_history(col):
  col_sarcasm_history = {}
  col_non_sarcasm_history = {}
  col_sarcasm_proportion_history = {}
  num_comments = {}
  for val in list(df_train_balanced_sarcastic[col]):
    if (val not in col_sarcasm_history):
      col_sarcasm_history[val] = 1
    else:
      col_sarcasm_history[val] += 1

    if (val not in col_non_sarcasm_history):
      col_non_sarcasm_history[val] = 1

  for val in list(df_train_balanced_unsarcastic[col]):
    if (val not in col_sarcasm_history):
      col_non_sarcasm_history[val] = 1
    else:
      col_non_sarcasm_history[val] += 1
    
    if (val not in col_sarcasm_history):
      col_sarcasm_history[val] = 0;

  for (val, num_sarcastic_comments) in col_sarcasm_history.items():
    sarcasm_proportion = col_sarcasm_history[val]/(col_sarcasm_history[val] + col_non_sarcasm_history[val])
    num_comments[val] = (col_sarcasm_history[val] + col_non_sarcasm_history[val])
    col_sarcasm_proportion_history[val] = sarcasm_proportion

  num_comments_values = num_comments.values()
  max_num_comments = max(num_comments_values)
  min_num_comments = min(num_comments_values)

  ## Normalization
  for val in num_comments.keys():
    num_comments[val] = (num_comments[val] - min_num_comments)/(max_num_comments - min_num_comments)

  return col_sarcasm_proportion_history, num_comments

author_sarcasm_proportion_history, num_comments_by_author = feature_history("author")
df_train_balanced["author_history"] = df_train_balanced["author"].apply(lambda x: author_sarcasm_proportion_history[x])
df_train_balanced["num_comments_by_author"] = df_train_balanced["author"].apply(lambda x: num_comments_by_author[x])
print("Number of authors ", len(num_comments_by_author.keys()))

Number of authors  256560


In [26]:
num_unique_subreddits = df_train_balanced['subreddit'].nunique()
num_cols_train_balanced = df_train_balanced.size
num_comments_per_subreddit = num_cols_train_balanced/num_unique_subreddits
num_comments_per_subreddit

subreddit_sarcasm_proportion_history, num_comments_by_subreddit = feature_history("subreddit")
df_train_balanced["author_history"] = df_train_balanced["subreddit"].apply(lambda x: subreddit_sarcasm_proportion_history[x])
df_train_balanced["num_comments_by_author"] = df_train_balanced["subreddit"].apply(lambda x: num_comments_by_subreddit[x])
print("Number of subreddits ", len(num_comments_by_subreddit.keys()))

Number of subreddits  14876


In [27]:
df_train_balanced.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,num_comment_words,num_parent_comment_words,author_history,num_comments_by_author
0,0,"[NC, NH, .]",Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"[Yeah, ,, I, get, argument., At, point, ,, I, ...",3,15,0.605332,0.601349
1,0,"[You, know, west, teams, play, west, teams, ea...",Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,"[The, blazers, Mavericks, (, The, wests, 5, 6,...",11,21,0.526472,0.215397
2,0,"[They, underdogs, earlier, today, ,, since, Gr...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,"[They, 're, favored, win, .]",16,5,0.509823,0.215443
3,0,"[This, meme, n't, funny, none, ``, new, york, ...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,"[deadass, n't, kill, buzz]",12,4,0.379367,0.02788
4,0,"[I, could, use, one, tools, .]",cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,"[Yep, confirm, I, saw, tool, use, that., It, m...",6,11,0.50319,0.019079


In [38]:
model = Word2Vec(list(df_train_balanced['comment']), min_count=1,size= 100,workers=3, window =3, sg = 1)
print(model)

Word2Vec(vocab=286603, size=100, alpha=0.025)


In [39]:
model['lmao']

  """Entry point for launching an IPython kernel.


array([-0.2790526 ,  0.3267132 ,  0.16707304, -0.20092127,  0.27449945,
        0.11265453, -0.33591047,  0.6426989 , -0.03416266, -0.23984823,
        0.16321829,  0.25911152,  0.4184589 ,  0.77843827, -0.16694531,
       -0.19487874, -0.04001873, -0.14829214,  0.28929943, -0.01079083,
       -0.5405755 ,  0.35313714, -0.4056052 , -0.4376262 , -0.09952006,
        0.1637126 ,  0.31541106,  0.62301755, -0.35932744,  0.3664826 ,
        0.14527467, -0.12249628,  0.32531998, -0.4806752 ,  0.04687217,
        0.8414928 , -0.6379349 ,  0.4821617 , -0.16828884, -0.41009638,
        0.2959564 , -0.18136306,  0.88568926, -0.9685257 ,  0.24162081,
       -0.4473707 , -0.08332538,  0.03992237,  0.01273577,  0.09052487,
       -0.2915893 , -0.1651263 , -0.21028213, -0.4164008 ,  0.18922034,
       -0.6633589 ,  0.46683195, -0.19294062, -0.3274637 , -0.12314048,
        0.3044461 , -0.01567632,  0.67139924,  0.37783837, -0.24034737,
        0.07777663, -0.25305477, -0.3606546 , -0.08390702, -0.03

In [37]:
## Store number of sarcastic, non sarcastic, and total comments made by an author in the past
## First filter and split the dataset into sarcastic comments and non-sarcastic comments

df_train_balanced_sarcastic = df_train_balanced[df_train_balanced['label'] == 1]
df_train_balanced_unsarcastic = df_train_balanced[df_train_balanced['label'] == 0]
print(df_train_balanced_sarcastic.head())
print(df_train_balanced_unsarcastic.head())

    label  ... num_comments_by_author
33      1  ...               0.000305
44      1  ...               0.601349
45      1  ...               0.601349
66      1  ...               0.601349
69      1  ...               0.008024

[5 rows x 14 columns]
   label  ... num_comments_by_author
0      0  ...               0.601349
1      0  ...               0.215397
2      0  ...               0.215443
3      0  ...               0.027880
4      0  ...               0.019079

[5 rows x 14 columns]
