## This notebook:
+ Logistic regression for irony detection
+ Accuracy: 0.65

## Required imports

In [None]:
pip install contractions

In [None]:
pip install emoji

In [None]:
# Imports:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

import emoji
from nltk.tokenize import TweetTokenizer
import re
import contractions

## Load irony data

In [None]:
# Read the training set: a tab-separated file whose own header row (row 0)
# is replaced by the three column names given below.
train_path = '/content/drive/MyDrive/TeamLab/semeval_taskA.csv'

df_train = pd.read_csv(
    train_path,
    sep='\t',
    header=0,
    names=['index', 'irony_label', 'tweet'],
)
                                                

In [None]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,index,irony_label,tweet
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [None]:
# Class-balance check on the training labels.

# Binary classification: a tweet is labelled 1 (ironic) or 0 (not ironic).
# `classes` holds the distinct labels in their order of first appearance.
classes = df_train['irony_label'].unique()

# Number of training tweets per label: label 0 first, then label 1.
print((df_train['irony_label'] == 0).sum())
print((df_train['irony_label'] == 1).sum())

# => The two counts are nearly equal, so the dataset is balanced.

1916
1901


In [None]:
# Read the test set with the same layout as the training set: tab-separated,
# header row replaced by the three column names given below.
test_path = '/content/drive/MyDrive/TeamLab/semeval_taskA_test.csv'

df_test = pd.read_csv(
    test_path,
    sep='\t',
    header=0,
    names=['index', 'irony_label', 'tweet'],
)

# Number of test tweets per label: label 0 first, then label 1.
print((df_test['irony_label'] == 0).sum())
print((df_test['irony_label'] == 1).sum())

# Preview the first rows of the test frame.
df_test.head()

473
311


Unnamed: 0,index,irony_label,tweet
0,1,0,@Callisto1947 Can U Help?||More conservatives ...
1,2,1,"Just walked in to #Starbucks and asked for a ""..."
2,3,0,#NOT GONNA WIN http://t.co/Mc9ebqjAqj
3,4,0,@mickymantell He is exactly that sort of perso...
4,5,1,So much #sarcasm at work mate 10/10 #boring 10...


In [None]:
# Normalisation function

def normalise_tweet(tweet):
    """Normalise a raw tweet into a lowercase, space-separated token string.

    Steps, in order:
      1. "&" becomes "and"; literal "<" and ">" are removed, so the only
         angle brackets left afterwards are those of the placeholders
         inserted below.
      2. Every URL (http or https) becomes "<url>".
      3. Runs of two or more dots become " <ellipsis>"; three or more "?" /
         "!" become "<manyqm>" / "<manyexm>"; any remaining "?" / "!" runs
         become "<questm>" / "<exclm>".
      4. "@" gets a leading space, "#" is replaced by a space, and the
         remaining punctuation in the character class below is deleted.
      5. Emoji are converted to their text names with emoji.demojize and
         the surrounding colons are turned into "<" and ">".
      6. Contractions are expanded with contractions.fix and a trailing
         "'s" becomes "<possessive>".
      7. The text is tokenised with nltk's TweetTokenizer; tokens starting
         with "@" become "<taggeduser>", numeric tokens become "<number>",
         and every token is lowercased. A token that was entirely upper
         case (and longer than one character) is followed by "<allcaps>".

    Parameters
    ----------
    tweet : str
        The raw tweet text.

    Returns
    -------
    str
        The normalised tokens joined by single spaces.
    """
    norm_tweet = re.sub(r"&", "and", tweet)
    norm_tweet = re.sub(r"[<>]", "", norm_tweet)
    # Replace only the URL itself (up to the next whitespace). The previous
    # pattern "http:.*" was greedy and discarded everything that followed the
    # first link on the line, and it never matched https links.
    norm_tweet = re.sub(r"https?:\S*", "<url>", norm_tweet)
    norm_tweet = re.sub(r"\.\.+", " <ellipsis>", norm_tweet)
    # The "many" variants must run before the single-mark variants, which
    # would otherwise consume the whole run.
    norm_tweet = re.sub(r"\?\?\?+", "<manyqm>", norm_tweet)
    norm_tweet = re.sub(r"\?+", "<questm>", norm_tweet)
    norm_tweet = re.sub(r"!!!+", "<manyexm>", norm_tweet)
    norm_tweet = re.sub(r"!+", "<exclm>", norm_tweet)
    norm_tweet = re.sub(r"@", " @", norm_tweet)
    norm_tweet = re.sub(r"#", " ", norm_tweet)
    norm_tweet = re.sub(r"[-()/_;:{}=~|.,\[\]]", "", norm_tweet)

    # Colons and underscores were stripped above, so the only ones present
    # after demojize are those delimiting emoji names (":name:" -> "<name>").
    norm_tweet = emoji.demojize(norm_tweet)
    norm_tweet = re.sub(r":(?=[\w?])", "<", norm_tweet)
    norm_tweet = re.sub(r"(?<=[\w?]):", ">", norm_tweet)

    norm_tweet = contractions.fix(norm_tweet)
    norm_tweet = re.sub(r"(?<=[A-Za-z?])'s", "<possessive>", norm_tweet)

    tokenizer = TweetTokenizer()
    tokens = []

    for token in tokenizer.tokenize(norm_tweet):
        if token.startswith("@"):
            token = "<taggeduser>"
        if token.isnumeric():
            token = "<number>"

        # Lowercase everything; an all-caps word additionally gets a marker
        # token so the emphasis is not lost.
        if token.isupper() and len(token) > 1:
            tokens.append(token.lower())
            tokens.append("<allcaps>")
        else:
            tokens.append(token.lower())

    return " ".join(tokens).strip()

In [None]:
# Raw tweet texts (inputs) and irony labels (targets) of both splits as
# NumPy arrays.
x_train, y_train = df_train['tweet'].to_numpy(), df_train['irony_label'].to_numpy()
x_test, y_test = df_test['tweet'].to_numpy(), df_test['irony_label'].to_numpy()

In [None]:
# Run every tweet of both splits through normalise_tweet and keep the
# results as NumPy arrays of strings.
x_train_norm = np.array([normalise_tweet(raw_tweet) for raw_tweet in x_train])
x_test_norm = np.array([normalise_tweet(raw_tweet) for raw_tweet in x_test])

In [None]:
# Vectorise

# Learn the vocabulary from the normalised training tweets and, in the same
# step, encode them as a sparse matrix of token counts (one row per tweet,
# one column per vocabulary entry).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(x_train_norm)

# Encode the test tweets with the vocabulary learned from the training data.
X_test = vectorizer.transform(x_test_norm)

In [None]:
# Show the first normalised training tweet followed by its sparse count vector.
print(x_train_norm[0],'\n', X_train[0])

sweet united nations video just in time for christmas imagine noreligion <url> 
   (0, 1636)	1
  (0, 3303)	1
  (0, 4308)	1
  (0, 4345)	1
  (0, 4692)	1
  (0, 5766)	1
  (0, 5944)	1
  (0, 8389)	1
  (0, 8700)	1
  (0, 9046)	1
  (0, 9096)	1
  (0, 9178)	1


## ML model: logistic regression

In [None]:
# Logistic regression model

# Fit on the vectorised training tweets; the solver may run for up to
# 1000 iterations.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the test set.
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.6505102040816326


In [None]:
# Predict a label for every test tweet.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels,
# both ordered as in `classes`.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=classes)
df_cm = pd.DataFrame(data=cm, columns=classes, index=classes)
df_cm

Unnamed: 0,1,0
1,182,129
0,145,328


In [None]:
# Same confusion matrix as above, with every column divided by its own total:
# each column (one predicted label) sums to 1, so a cell is the fraction of
# that prediction that belongs to the row's true label.
df_cm_percentage = df_cm.div(df_cm.sum(axis=0), axis='columns')

df_cm_percentage

Unnamed: 0,1,0
1,0.556575,0.282276
0,0.443425,0.717724


In [None]:
# Test model

tweet = "I just love it when it rains and I can't do anything!!"

# The vectorizer was fitted on normalised tweets, so a new tweet has to go
# through the same normalise_tweet step before it is vectorised; otherwise
# its tokens do not line up with the vocabulary seen during training.
vectTweet = vectorizer.transform(np.array([normalise_tweet(tweet)]))

prediction = classifier.predict(vectTweet)  # predicts class of the tweet
print('Tweet is', 'ironic' if prediction[0]==1 else 'not ironic')

Tweet is ironic
