In [None]:
import getpass
import os
from urllib.parse import quote

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
# Definitions of helper function

class MySQLConnection:
    """Small helper around a SQLAlchemy engine/connection to a MySQL database.

    The password is requested interactively via ``getpass`` when the object is
    created, so it does not have to be written into the notebook.

    Attributes set by ``__init__``:
        user, hostname, db: the connection parameters, stored as strings.
        engine: the engine returned by ``create_engine``.
        cnx: a connection opened from ``engine``.
    """

    def __init__(self, user, hostname, db):
        """Ask for the password of ``user`` and connect to ``db`` on ``hostname``.

        Args:
            user: database user name.
            hostname: database host.
            db: name of the database (schema) to connect to.
        """
        # Local import: create_engine comes from SQLAlchemy, which the rest of
        # the notebook never references directly.
        from sqlalchemy import create_engine

        self.user = str(user)
        self.hostname = str(hostname)
        self.db = str(db)
        pword = getpass.getpass("Enter password for user {}".format(user))
        self.engine = create_engine(self._build_url(user, pword, hostname, db))
        # Writing logic in ObjectLogic.py depends on this being named cnx:
        self.cnx = self.engine.connect()

    @staticmethod
    def _build_url(user, pword, hostname, db):
        """Return the ``mysql://user:password@host/db`` URL for ``create_engine``.

        User name and password are percent-encoded: characters such as ``@``,
        ``/`` or ``:`` inside a password would otherwise be read as URL
        delimiters and yield a wrong host or credentials.
        """
        return "mysql://{}:{}@{}/{}".format(
            quote(str(user), safe=""),
            quote(str(pword), safe=""),
            hostname,
            db,
        )

    def write_to_db(self, df, table_name):
        """Append ``df`` to ``table_name`` (best effort).

        A failed write is reported on stdout instead of being raised.

        Args:
            df: object exposing ``to_sql`` (a pandas DataFrame).
            table_name: name of the target table; rows are appended to it.

        Returns:
            True when the write succeeded, False when it failed.
        """
        try:
            df.to_sql(table_name, con=self.cnx, if_exists='append', index=False)
        except Exception as e:
            print("\n SQL Write error with: ")
            print(df, "\n", e)
            return False
        return True

    def get_table(self, table_name):
        """Return all rows of ``table_name`` as a DataFrame.

        ``table_name`` is concatenated into the SQL text unchanged, so only
        pass trusted, hard-coded table names.
        """
        return pd.read_sql("SELECT * FROM " + table_name, self.cnx)
        

In [None]:
# Open the database connection (asks for the password interactively)
sql = MySQLConnection(user='skrs', hostname='localhost', db='sarcasm')

In [None]:
# Raw data as read from disk
train_data = pd.read_csv('data/train.csv')
# Keep only complete rows: a missing value in any column drops the row.
# Not too many rows are affected anyway.
test_set = train_data.dropna(axis=0, how='any')

In [None]:
# # First, put the dataset into an SQL database for getting general insights
# # train_data.to_sql('initial_data',sql.cnx) # needs to be executed once  -  DONE


# Certain subreddits are more likely to have sarcastic comments.
# Idea: Get the correlation between subreddit and sarcasm. It can be used to weight results and/or as an additional validation.
# Easy enough to do with MySQL.


# Assign to subreddits how many comments they have
"""
create table data_subr_counts as 
select 
    subreddit as sub, 
    count(*) as entry_count
from 
    test_data
group by subreddit
"""


# Leave only the subreddits with comment count > 30 
# (more or less arbitrarily chosen amount but should make the results more statistically significant)
"""
create table test_data_relev as 
select * 
from 
    test_data left join data_subr_counts 
        on test_data.subreddit = data_subr_counts.sub
where entry_count > 30
"""


# Now count how many sarcastic comments and how many non-sarcastic comments are in each subreddit 
"""
create table relevant_nonsarcastic_comment_counts
as select 
    subreddit, 
    count(*) as nonsarc_count 
from 
    test_data_relev 
where 
    label = 0 
group by 
    subreddit
"""
"""    

create table relevant_sarcastic_comment_counts
as select 
    subreddit as sub, 
    count(*) as sarc_count 
from 
    test_data_relev 
where 
    label = 1 
group by 
    subreddit
"""


# And finally, calculate the sarcasm ratio for each (definitely) statistically relevant subreddit
"""
create table relevant_sarcasm_ratio 
as select 
    subreddit,
    sarc_count / (nonsarc_count + sarc_count) as sarcasm_ratio
from 
    relevant_sarcastic_comment_counts join relevant_nonsarcastic_comment_counts
        on relevant_sarcastic_comment_counts.sub = relevant_nonsarcastic_comment_counts.subreddit
"""

# subr_sarcasm_ratio = sql.get_table('relevant_sarcasm_ratio')
# subr_sarcasm_ratio

# Question: 
# How was the data collected? Was it equal amounts of sarcastic and non-sarcastic comments from each subreddit?


# Idea: Amount of sarcasm in comments varies based on the day of the week.


In [None]:
# alternate token_pattern='[a-z]{2,}'

# Basic version of the model: bag-of-words counts of the comment text only,
# fed into a logistic regression.

# Split first, so that the vocabulary (and its min_df filter) is learned from the
# training rows only; fitting the vectorizer on all rows leaks information about
# the held-out comments into the features.
# random_state pins the split so the score is reproducible from run to run.
X_tr, X_tst, y_train, y_test = train_test_split(
    test_set['comment'], test_set['label'], random_state=42
)

vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(X_tr)
X_test = vect.transform(X_tst)

# get_feature_names() was removed in scikit-learn 1.2; use it only as a fallback
# for old installations that lack get_feature_names_out().
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(feature_names[::2000])
print(len(feature_names))

clf = LogisticRegression()
clf.fit(X_train, y_train)

# Report accuracy on the held-out rows and the training-matrix shape, in the same
# format as the results recorded in the notes.
score = clf.score(X_test, y_test)
print("score: ", score)
print(X_train.shape)


# Basic version.
# Some misspellings are frequent and should correlate with sarcasm about as well as the correctly spelled words, but
# rarer misspellings just bloat the number of features; the same goes for rare words that won't be statistically significant.
# Conclusion:
# Ignore words with fewer than 5 occurrences: min_df = 5
# ngram size = 1 (default)
# Nothing besides the comment itself is taken into consideration.




# Basic + 1-2-grams
# vect = CountVectorizer(min_df=5,ngram_range=(1,2))
# vect.fit(test_set['comment'])
# score:  0.718619325052215
# (682272, 215441)
# score:  0.7181444432230406
# (682272, 215441)




# Basic + 1-3-grams
# vect = CountVectorizer(min_df=5,ngram_range=(1,3))
# vect.fit(test_set['comment'])
# score:  0.7199604265142354
# (682272, 342466)
# score:  0.7205979993404419
# (682272, 342466)


# Basic + 2-3-grams
# vect = CountVectorizer(min_df=5,ngram_range=(2,3))
# vect.fit(test_set['comment'])
# score:  0.695776629658129
# (682272, 302974)



In [None]:


# Model with the subreddit added as an additional word

# Append the subreddit name to each comment, separated by a space, so the
# vectorizer tokenizes it like any other word. The text is built directly from
# the two columns instead of adding a helper column to test_set, which avoids
# the SettingWithCopyWarning and leaves test_set unchanged for other cells.
new_set = pd.DataFrame(test_set['comment'] + " " + test_set['subreddit'])

# Split first, so that vocabulary and IDF weights are learned from the training
# rows only; fitting on all rows leaks information about the held-out rows.
# random_state pins the split so the score is reproducible from run to run.
X_tr, X_tst, y_train, y_test = train_test_split(
    new_set[0], test_set['label'], random_state=42
)  # note new_set[0] instead of test_set['comment']

vect = TfidfVectorizer(min_df=5, ngram_range=(1, 2))
X_train = vect.fit_transform(X_tr)
X_test = vect.transform(X_tst)

# get_feature_names() was removed in scikit-learn 1.2; use it only as a fallback
# for old installations that lack get_feature_names_out().
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(feature_names[::2000])
print(len(feature_names))

clf = LogisticRegression()
clf.fit(X_train, y_train)

# Accuracy on the held-out rows, plus the training-matrix shape
score = clf.score(X_test, y_test)
print("score: ", score)
print(X_train.shape)

# Basic + Add subreddit to the features. 
# https://www.dataquest.io/blog/settingwithcopywarning/
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# score:  0.6975882158953501
# (682272, 43217)


# Basic + Add subreddit to the features + 1-2-grams
# score:  0.7247883917775091
# (682272, 242406)


# Basic + add subreddit + 1-3-grams
# score:  0.7243354952182038
# (682272, 374868)

