In [3]:
import numpy as np
import os
import pandas as pd

# Directory holding the CSV files loaded below. The listing and the loads
# share this single location so the printed paths are the files actually read
# (previously the listing walked a different directory than the one loaded from).
DATA_DIR = '/content'

# The CSVs are read with header=None, so column names are supplied explicitly.
COLUMN_NAMES = ['ID', 'Entity', 'Sentiment', 'Content']

# Displaying file names in the data directory
for root, dirs, files in os.walk(DATA_DIR):
    for file in files:
        print(os.path.join(root, file))

# Loading the datasets
train_df = pd.read_csv(os.path.join(DATA_DIR, 'twitter_training.csv'), header=None, names=COLUMN_NAMES)
test_df = pd.read_csv(os.path.join(DATA_DIR, 'twitter_validation.csv'), header=None, names=COLUMN_NAMES)

# Keep only the label and the tweet text; the 'ID' and 'Entity' columns are discarded.
train_df = train_df.drop(['ID', 'Entity'], axis=1)
test_df = test_df.drop(['ID', 'Entity'], axis=1)

# Report per-column missing-value counts and the size of the training data
# before any rows are removed.
print(train_df.isna().sum())
print(train_df.shape)

# For each dataset: drop rows with missing values, then drop duplicate rows.
train_df = train_df.dropna().drop_duplicates()
test_df = test_df.dropna().drop_duplicates()

# Confirm that the test data has no missing values left.
print(test_df.isna().sum())

# Stack the training and test rows into a single frame; ignore_index=True
# renumbers the rows instead of keeping each frame's own labels.
combined_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)

# Report the size of the stacked frame and how many of its rows are duplicates.
print(combined_df.shape)
duplicates_before = combined_df.duplicated().sum()
print(duplicates_before)

# Drop rows duplicated across the two sources. The index is not reset here,
# so the surviving rows keep their labels (gaps included).
combined_df = combined_df.drop_duplicates()

# Verify that no duplicates remain and report the final size.
duplicates_after = combined_df.duplicated().sum()
print("Number of duplicates in the combined dataset:", duplicates_after)
print(combined_df.shape)

# Adding a column for the length (in characters) of the tweet content.
# The vectorized .str accessor replaces a per-row Python call to len().
# NOTE(review): assumes every remaining 'Content' value is a string (nulls were
# dropped earlier); a non-string value would yield NaN here rather than raise.
combined_df['Content_Length'] = combined_df['Content'].str.len()

# Displaying the combined dataset
print(combined_df)

# Counting the occurrences of each sentiment
print(combined_df['Sentiment'].value_counts())

# Folding 'Irrelevant' into 'Neutral'; every other label is left unchanged.
combined_df['Sentiment'] = combined_df['Sentiment'].replace('Irrelevant', 'Neutral')

# Displaying the count of each sentiment after replacement
print(combined_df['Sentiment'].value_counts())


Sentiment      0
Content      686
dtype: int64
(74682, 2)
Sentiment    0
Content      0
dtype: int64
(70768, 2)
516
Number of duplicates in the combined dataset: 0
(70252, 2)
        Sentiment                                            Content  \
0        Positive  im getting on borderlands and i will murder yo...   
1        Positive  I am coming to the borders and I will kill you...   
2        Positive  im getting on borderlands and i will kill you ...   
3        Positive  im coming on borderlands and i will murder you...   
4        Positive  im getting on borderlands 2 and i will murder ...   
...           ...                                                ...   
70756     Neutral  ♥️ Suikoden 2\n1️⃣ Alex Kidd in Miracle World\...   
70757    Positive  Thank you to Matching funds Home Depot RW paym...   
70759     Neutral  Late night stream with the boys! Come watch so...   
70763  Irrelevant  ⭐️ Toronto is the arts and culture capital of ...   
70764  Irrelevant  tHIS IS ACTUAL