In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
train_data = pd.read_csv('macbook_twitter_sentiment_train.csv')
test_data = pd.read_csv('macbook_twitter_sentiment_test.csv')

# Step 2: Assign the correct target column name
target_column = 'Label'  # Replace 'Label' with the correct column name for sentiment labels

# Step 3: Feature Extraction and Data Preparation
vectorizer = TfidfVectorizer(max_features=1000)
X_train_all = vectorizer.fit_transform(train_data['Tweet'])
y_train_all = train_data[target_column]

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

# Step 4: Train the Model
model = XGBClassifier()
model.fit(X_train, y_train)

# Step 5: Evaluate the Model on Validation Set
y_pred_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val:.2f}")

# Optional: Print classification report for detailed metrics on validation set
print(classification_report(y_val, y_pred_val))

# Step 6: Make Predictions on Test Data (without labels)
X_test = vectorizer.transform(test_data['Tweet'])
predictions_test = model.predict(X_test)
print(predictions_test)


Validation Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.48      0.66      0.55       485
           1       0.50      0.31      0.38       515

    accuracy                           0.48      1000
   macro avg       0.49      0.49      0.47      1000
weighted avg       0.49      0.48      0.47      1000

[0 1 1 ... 1 0 1]


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
train_data = pd.read_csv('macbook_twitter_sentiment_train.csv')
test_data = pd.read_csv('macbook_twitter_sentiment_test.csv')

# Step 2: Assign the correct target column name
target_column = 'Label'  # Replace 'Label' with the correct column name for sentiment labels

# Step 3: Feature Extraction and Data Preparation
vectorizer = TfidfVectorizer(max_features=1000)
X_train_all = vectorizer.fit_transform(train_data['Tweet'])
y_train_all = train_data[target_column]

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

# Step 4: Train the SVM Model
svm_model = SVC(kernel='linear')  # You can specify different kernels and hyperparameters
svm_model.fit(X_train, y_train)

# Step 5: Evaluate the Model on Validation Set
y_pred_val = svm_model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val:.2f}")

# Optional: Print classification report for detailed metrics on validation set
print(classification_report(y_val, y_pred_val))

# Step 6: Make Predictions on Test Data (without labels)
X_test = vectorizer.transform(test_data['Tweet'])
predictions_test = svm_model.predict(X_test)
print(predictions_test)


Validation Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.48      0.66      0.55       485
           1       0.50      0.31      0.38       515

    accuracy                           0.48      1000
   macro avg       0.49      0.49      0.47      1000
weighted avg       0.49      0.48      0.47      1000

[0 1 1 ... 1 0 1]


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
train_data = pd.read_csv('macbook_twitter_sentiment_train.csv')
test_data = pd.read_csv('macbook_twitter_sentiment_test.csv')

# Step 2: Assign the correct target column name
target_column = 'Label'  # Replace 'Label' with the correct column name for sentiment labels

# Step 3: Feature Extraction and Data Preparation
vectorizer = TfidfVectorizer(max_features=1000)
X_train_all = vectorizer.fit_transform(train_data['Tweet'])
y_train_all = train_data[target_column]

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

# Step 4: Train the Logistic Regression Model
logreg_model = LogisticRegression(max_iter=1000)  # You can adjust max_iter and other hyperparameters
logreg_model.fit(X_train, y_train)

# Step 5: Evaluate the Model on Validation Set
y_pred_val = logreg_model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val:.2f}")

# Optional: Print classification report for detailed metrics on validation set
print(classification_report(y_val, y_pred_val))

# Step 6: Make Predictions on Test Data (without labels)
X_test = vectorizer.transform(test_data['Tweet'])
predictions_test = logreg_model.predict(X_test)
print(predictions_test)


Validation Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.48      0.66      0.55       485
           1       0.50      0.31      0.38       515

    accuracy                           0.48      1000
   macro avg       0.49      0.49      0.47      1000
weighted avg       0.49      0.48      0.47      1000

[0 1 1 ... 1 0 1]


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
train_data = pd.read_csv('macbook_twitter_sentiment_train.csv')
test_data = pd.read_csv('macbook_twitter_sentiment_test.csv')

# Step 2: Assign the correct target column name
target_column = 'Label'  # Replace 'Label' with the correct column name for sentiment labels

# Step 3: Feature Extraction and Data Preparation
vectorizer = TfidfVectorizer(max_features=1000)
X_train_all = vectorizer.fit_transform(train_data['Tweet'])
y_train_all = train_data[target_column]

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

# Step 4: Train the Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators and other hyperparameters
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the Model on Validation Set
y_pred_val = rf_model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val:.2f}")

# Optional: Print classification report for detailed metrics on validation set
print(classification_report(y_val, y_pred_val))

# Step 6: Make Predictions on Test Data (without labels)
X_test = vectorizer.transform(test_data['Tweet'])
predictions_test = rf_model.predict(X_test)
print(predictions_test)


Validation Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.48      0.66      0.55       485
           1       0.50      0.31      0.38       515

    accuracy                           0.48      1000
   macro avg       0.49      0.49      0.47      1000
weighted avg       0.49      0.48      0.47      1000

[0 1 1 ... 1 0 1]


xgboost tuning

In [5]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300]
}

# Initialize the XGBoost model
xgb_model = XGBClassifier()

# Perform Grid Search to find the best parameters
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=3, scoring='accuracy',  verbose=3)
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params_xgb = grid_search_xgb.best_params_
best_estimator_xgb = grid_search_xgb.best_estimator_

print("Best Parameters for XGBoost:", best_params_xgb)



Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.532 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.509 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.536 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.532 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.509 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.536 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.532 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.509 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.536 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.532 total ti

svm tuning

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Initialize the SVM model
svm_model = SVC()

# Perform Grid Search to find the best parameters
grid_search_svm = GridSearchCV(estimator=svm_model, param_grid=param_grid_svm, cv=3, scoring='accuracy',  verbose=3)
grid_search_svm.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params_svm = grid_search_svm.best_params_
best_estimator_svm = grid_search_svm.best_estimator_

print("Best Parameters for SVM:", best_params_svm)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.532 total time=   0.3s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.509 total time=   0.3s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.526 total time=   0.3s
[CV 1/3] END .................C=0.1, kernel=rbf;, score=0.532 total time=   0.3s
[CV 2/3] END .................C=0.1, kernel=rbf;, score=0.509 total time=   0.3s
[CV 3/3] END .................C=0.1, kernel=rbf;, score=0.526 total time=   0.3s
[CV 1/3] END ................C=1, kernel=linear;, score=0.532 total time=   0.2s
[CV 2/3] END ................C=1, kernel=linear;, score=0.509 total time=   0.2s
[CV 3/3] END ................C=1, kernel=linear;, score=0.536 total time=   0.2s
[CV 1/3] END ...................C=1, kernel=rbf;, score=0.532 total time=   0.3s
[CV 2/3] END ...................C=1, kernel=rbf;, score=0.509 total time=   0.2s
[CV 3/3] END ...................C=1, kernel=rbf;,

Logistic Regression tuning

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for Logistic Regression
param_grid_logreg = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)

# Perform Grid Search to find the best parameters
grid_search_logreg = GridSearchCV(estimator=logreg_model, param_grid=param_grid_logreg, cv=3, scoring='accuracy',  verbose=3)
grid_search_logreg.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params_logreg = grid_search_logreg.best_params_
best_estimator_logreg = grid_search_logreg.best_estimator_

print("Best Parameters for Logistic Regression:", best_params_logreg)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 2/3] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 3/3] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 1/3] END .................C=0.1, penalty=l2;, score=0.532 total time=   0.0s
[CV 2/3] END .................C=0.1, penalty=l2;, score=0.509 total time=   0.0s
[CV 3/3] END .................C=0.1, penalty=l2;, score=0.536 total time=   0.0s
[CV 1/3] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 2/3] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 3/3] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 1/3] END ...................C=1, penalty=l2;, score=0.532 total time=   0.0s
[CV 2/3] END ...................C=1, penalty=l2;, score=0.509 total time=   0.0s
[CV 3/3] END ...................C=1, penalty=l2;,

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jayan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jayan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\jayan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



# RF tuning

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier()

# Perform Grid Search to find the best parameters
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=3, scoring='accuracy',  verbose=3)
grid_search_rf.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params_rf = grid_search_rf.best_params_
best_estimator_rf = grid_search_rf.best_estimator_

print("Best Parameters for Random Forest:", best_params_rf)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END ..max_depth=None, n_estimators=100;, score=0.518 total time=   0.1s
[CV 2/3] END ..max_depth=None, n_estimators=100;, score=0.509 total time=   0.1s
[CV 3/3] END ..max_depth=None, n_estimators=100;, score=0.536 total time=   0.1s
[CV 1/3] END ..max_depth=None, n_estimators=200;, score=0.532 total time=   0.3s
[CV 2/3] END ..max_depth=None, n_estimators=200;, score=0.509 total time=   0.3s
[CV 3/3] END ..max_depth=None, n_estimators=200;, score=0.536 total time=   0.3s
[CV 1/3] END ..max_depth=None, n_estimators=300;, score=0.532 total time=   0.4s
[CV 2/3] END ..max_depth=None, n_estimators=300;, score=0.509 total time=   0.4s
[CV 3/3] END ..max_depth=None, n_estimators=300;, score=0.536 total time=   0.4s
[CV 1/3] END .....max_depth=5, n_estimators=100;, score=0.518 total time=   0.0s
[CV 2/3] END .....max_depth=5, n_estimators=100;, score=0.509 total time=   0.0s
[CV 3/3] END .....max_depth=5, n_estimators=100;

# overall sentiment score

In [9]:
import numpy as np

# Using each best estimator to predict on the test data
predictions_test_xgb = best_estimator_xgb.predict(X_test)
predictions_test_svm = best_estimator_svm.predict(X_test)
predictions_test_logreg = best_estimator_logreg.predict(X_test)
predictions_test_rf = best_estimator_rf.predict(X_test)

# Creating an array of predictions for each model
all_predictions = np.array([
    predictions_test_xgb,
    predictions_test_svm,
    predictions_test_logreg,
    predictions_test_rf
])

# Calculating the overall sentiment scores of each fine tuned  models
overall_sentiment_score = np.mean(all_predictions, axis=0)
print("Overall Sentiment Score:")
print(overall_sentiment_score)


Overall Sentiment Score:
[0. 1. 1. ... 1. 0. 1.]


In [10]:
#Calculating the overall sentiment score of the ensemble model
overall_sentiment_score = np.mean(overall_sentiment_score)  # Calculating the mean of all predictions (proportion of positive sentiments)
print("Overall Sentiment Score (Proportion of Positive Sentiments):", overall_sentiment_score)


Overall Sentiment Score (Proportion of Positive Sentiments): 0.3016


the overall sentiment leans slightly towards the positive side, but it's not overwhelmingly positive

In [13]:
from textblob import TextBlob
from collections import Counter

# Function to get sentiment polarity for a text
def get_sentiment_polarity(text):
    return TextBlob(text).sentiment.polarity

# Calculate sentiment polarity for each tweet in the test dataset
test_data['Sentiment_Polarity'] = test_data['Tweet'].apply(get_sentiment_polarity)

# Separate tweets based on their sentiment polarity
positive_tweets = test_data[test_data['Sentiment_Polarity'] > 0]['Tweet']
negative_tweets = test_data[test_data['Sentiment_Polarity'] < 0]['Tweet']

# Function to extract words from tweets
def extract_words(tweets):
    words = []
    for tweet in tweets:
        words.extend(tweet.split())
    return words

# Extract positive and negative words from the dataset
positive_words = extract_words(positive_tweets)
negative_words = extract_words(negative_tweets)

# Get the most common positive and negative words
most_common_positive = Counter(positive_words).most_common(10)
most_common_negative = Counter(negative_words).most_common(10)

# Function to find tweets containing certain words
def find_tweets_with_words(tweets, words):
    selected_tweets = []
    for tweet in tweets:
        if any(word in tweet for word in words):
            selected_tweets.append(tweet)
    return selected_tweets

# Find positive and negative tweets containing the most common positive and negative words
positive_tweets_containing_words = find_tweets_with_words(positive_tweets, [word[0] for word in most_common_positive])
negative_tweets_containing_words = find_tweets_with_words(negative_tweets, [word[0] for word in most_common_negative])

# Print 5 positive tweets containing positive words
print("Top 5 Positive Tweets with Positive Words:")
for tweet in positive_tweets_containing_words[:5]:
    print(tweet)

# Print 5 negative tweets containing negative words
print("\nTop 5 Negative Tweets with Negative Words:")
for tweet in negative_tweets_containing_words[:5]:
    print(tweet)


Top 5 Positive Tweets with Positive Words:
The new MacBook Pro is incredibly fast and efficient!
MacBook's retina display is stunning, great for design work.
The new MacBook Pro is incredibly fast and efficient!
Switching to MacBook has been a great experience, so user-friendly.
MacBook's retina display is stunning, great for design work.

Top 5 Negative Tweets with Negative Words:
Battery life on the latest MacBook is a bad.
The touch bar on the MacBook Pro is a game changer.
Battery life on the latest MacBook is a bad.
My MacBook keeps overheating, not what I expected.
Frustrated with the lack of ports on my MacBook.
