In [10]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [11]:
df = pd.read_csv("sentimentdataset.csv")

print(df.head())
print(df.columns)
print(df.shape)


   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! ðŸ’ª          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     Hashtags  Retweets  Likes  

In [12]:
# Drop unnamed index columns if present
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# Keep only useful columns
df = df[["Text", "Sentiment", "Likes", "Retweets", "Country"]]

# Drop missing text rows
df = df.dropna(subset=["Text"])

print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Text       732 non-null    object 
 1   Sentiment  732 non-null    object 
 2   Likes      732 non-null    float64
 3   Retweets   732 non-null    float64
 4   Country    732 non-null    object 
dtypes: float64(2), object(3)
memory usage: 28.7+ KB
None


In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # remove links
    text = re.sub(r"@\w+", "", text)         # remove mentions
    text = re.sub(r"#", "", text)            # remove hashtag symbol
    text = re.sub(r"[^a-z\s]", "", text)     # keep letters only
    return text

df["Clean_Text"] = df["Text"].apply(clean_text)
print(df[["Text","Clean_Text"]].head())


                                                Text  \
0   Enjoying a beautiful day at the park!        ...   
1   Traffic was terrible this morning.           ...   
2   Just finished an amazing workout! ðŸ’ª          ...   
3   Excited about the upcoming weekend getaway!  ...   
4   Trying out a new recipe for dinner tonight.  ...   

                                          Clean_Text  
0   enjoying a beautiful day at the park         ...  
1   traffic was terrible this morning            ...  
2   just finished an amazing workout                  
3   excited about the upcoming weekend getaway   ...  
4   trying out a new recipe for dinner tonight   ...  


In [14]:
X = df["Clean_Text"]
y = df["Sentiment"]

tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = tfidf.fit_transform(X)
