In [1]:
import pandas as pd
import numpy as np
import re
from transformers import AutoModel
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import classification_report

df = pd.read_csv("twitter_sentiment_data (1).csv")

In [2]:
df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43943 entries, 0 to 43942
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  43943 non-null  int64 
 1   message    43943 non-null  object
 2   tweetid    43943 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [4]:
duplicate_messages = df[df.duplicated(subset=['message'], keep=False)]['message']
duplicate_messages

1        RT @NatGeoChannel: Watch #BeforeTheFlood right...
4        RT @cnalive: Pranita Biswasi, a Lutheran from ...
6        RT @cnalive: Pranita Biswasi, a Lutheran from ...
12       RT @NatGeoChannel: Watch #BeforeTheFlood right...
14       RT @WorldResources: Reflections on Leonardo Di...
                               ...                        
40535    RT @antiarzE: - do u like green eggs &amp; ham...
42668    RT @exostext: Bbh: boys are hot\nBbh: girls ar...
42697    RT @realDonaldTrump: The global warming we sho...
43132    RT @ClimateReality: We can’t fight climate cha...
43825    RT @billmaher: Not a single question about cli...
Name: message, Length: 3446, dtype: object

In [5]:


df = df.drop_duplicates(subset=['message'], keep='first')
df.head()


Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41033 entries, 0 to 43942
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  41033 non-null  int64 
 1   message    41033 non-null  object
 2   tweetid    41033 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [15]:
df['sentiment'].value_counts()

sentiment
 1    20599
 2     9073
 0     7505
-1     3856
Name: count, dtype: int64

In [7]:
df.isnull().value_counts()


sentiment  message  tweetid
False      False    False      41033
Name: count, dtype: int64

In [8]:
def cleaner(text):
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'RT\s+', '', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.encode('ascii', 'ignore').decode('ascii') 
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [11]:
df['message'] = df['message'].apply(cleaner)

In [12]:
df = df.drop('tweetid', axis=1)

In [18]:
df = df[df['sentiment'] != 2]

print(df['sentiment'].value_counts())

sentiment
 1    20599
 0     7505
-1     3856
Name: count, dtype: int64


In [19]:
max_len = df_copy['message'].str.len().max()
print(f"Max len: {max_len}")


Max len: 153


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31960 entries, 0 to 43942
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  31960 non-null  int64 
 1   message    31960 non-null  object
dtypes: int64(1), object(1)
memory usage: 749.1+ KB


In [21]:
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['sentiment'], test_size=0.2, random_state=123,stratify=df['sentiment'])

In [22]:
y_train.value_counts()

sentiment
 1    16479
 0     6004
-1     3085
Name: count, dtype: int64

In [23]:
df_train = pd.DataFrame({'message': x_train, 'sentiment': y_train})

In [27]:
df_test = pd.DataFrame({'message': x_test, 'sentiment': y_test})

In [35]:
df_train['sentiment'].value_counts()

sentiment
 1    16479
 0     6004
-1     3085
Name: count, dtype: int64

In [31]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6392 entries, 9774 to 11711
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   message    6392 non-null   object
 1   sentiment  6392 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 149.8+ KB


In [34]:
df_test['sentiment'].value_counts()

sentiment
 1    4120
 0    1501
-1     771
Name: count, dtype: int64

In [36]:
df_majority = df_train[df_train.sentiment == 1]  
df_minority = df_train[df_train.sentiment == -1] 
df_neutral = df_train[df_train.sentiment == 0]  

minority_class_size = len(df_minority)

df_majority_downsampled = df_majority.sample(n=minority_class_size, replace=False, random_state=42)
df_neutral_downsampled = df_neutral.sample(n=minority_class_size, replace=False, random_state=42)

df_train = pd.concat([df_majority_downsampled, df_minority, df_neutral_downsampled])

print(df_train['sentiment'].value_counts())


sentiment
 1    3085
-1    3085
 0    3085
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 9255 entries, 4315 to 5077
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   message    9255 non-null   object
 1   sentiment  9255 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 216.9+ KB


In [37]:
df_train.to_csv('df_train.csv', index=False)

In [None]:
df_test.to_csv('df_test.csv', index=False)