In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('C:/Data/Tweets.csv')

# Display the first few rows of the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [2]:
# Keep only the relevant columns: 'text' for the tweet and 'airline_sentiment' for the sentiment
df = df[['text', 'airline_sentiment']]

# Display the value counts of sentiment labels
print(df['airline_sentiment'].value_counts())

# Map the sentiments to numerical labels: negative = 0, neutral = 1, positive = 2
df['sentiment_label'] = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Display the first few rows of the processed dataset
print(df.head())

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64
                                                text airline_sentiment  \
0                @VirginAmerica What @dhepburn said.           neutral   
1  @VirginAmerica plus you've added commercials t...          positive   
2  @VirginAmerica I didn't today... Must mean I n...           neutral   
3  @VirginAmerica it's really aggressive to blast...          negative   
4  @VirginAmerica and it's a really big bad thing...          negative   

   sentiment_label  
0                1  
1                2  
2                1  
3                0  
4                0  


In [3]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import re

# Load spaCy's English language model
nlp = spacy.load('en_core_web_sm')

# Custom function for text cleaning
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers (optional)
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize the text using spaCy, remove stop words, and lemmatize
    doc = nlp(text)
    cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return ' '.join(cleaned_tokens)

# Sample dataset
corpus = df['text']

# Clean the corpus
cleaned_corpus = [clean_text(doc) for doc in corpus]

# Initialize CountVectorizer (after cleaning the text)
vectorizer = CountVectorizer()

# Fit and transform the cleaned corpus
X = vectorizer.fit_transform(cleaned_corpus)

# Target labels
y = df['sentiment_label'].values

# Convert the result into a DataFrame for better readability (optional)
import pandas as pd
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the result
print(df)

print(X.shape)  # Shape of the feature matrix

       aa  aaaand  aaadvantage  aaalwayslate  aaba  aacom  aacustomerservice  \
0       0       0            0             0     0      0                  0   
1       0       0            0             0     0      0                  0   
2       0       0            0             0     0      0                  0   
3       0       0            0             0     0      0                  0   
4       0       0            0             0     0      0                  0   
...    ..     ...          ...           ...   ...    ...                ...   
14635   0       0            0             0     0      0                  0   
14636   0       0            0             0     0      0                  0   
14637   0       0            0             0     0      0                  0   
14638   0       0            0             0     0      0                  0   
14639   0       0            0             0     0      0                  0   

       aadavantage  aadelay  aadfw  ...

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model on the BoW representation
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Model Accuracy: 79.54%
              precision    recall  f1-score   support

    negative       0.86      0.89      0.88      1889
     neutral       0.60      0.55      0.58       580
    positive       0.75      0.69      0.72       459

    accuracy                           0.80      2928
   macro avg       0.74      0.71      0.72      2928
weighted avg       0.79      0.80      0.79      2928



In [7]:
# Example tweets to predict sentiment for
new_tweets = [
    "The flight was delayed by 2 hours. Terrible experience!",
    "Great service, I loved the extra legroom in business class.",
    "The flight was fine, nothing special."
]

new_cleaned_corpus = [clean_text(doc) for doc in new_tweets]

# Convert the new tweets to the BoW representation
new_X = vectorizer.transform(new_cleaned_corpus)

# Predict sentiment for the new tweets
new_predictions = clf.predict(new_X)

# Map the predictions back to sentiment labels
predicted_sentiments = ['negative' if pred == 0 else 'neutral' if pred == 1 else 'positive' for pred in new_predictions]

# Display the results
for tweet, sentiment in zip(new_tweets, predicted_sentiments):
    print(f"Tweet: {tweet} \nPredicted Sentiment: {sentiment}\n")

Tweet: The flight was delayed by 2 hours. Terrible experience! 
Predicted Sentiment: negative

Tweet: Great service, I loved the extra legroom in business class. 
Predicted Sentiment: positive

Tweet: The flight was fine, nothing special. 
Predicted Sentiment: negative

