1. Setup and Data Loading

In [2]:
# Import VADER sentiment analyzer
import pandas as pd
import nltk.sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download required lexicon if not already present
try:
    nltk.download('vader_lexicon', quiet=True)
except Exception as e:
    print(f"Error downloading vader_lexicon: {e}")

In [9]:
# Read from the extracted path instead of zip file
df= pd.read_csv(r"C:\Users\LENOVO\Practice_Twitter.zip")


2. Data Preprocessing and Sentiment Analysis

    2.1 VADER Sentiment Analysis


In [10]:
# Condition
sid = SentimentIntensityAnalyzer()

# Function to get sentiment label
def get_sentiment(text):
    # Get the polarity scores using VADER
    scores = sid.polarity_scores(text)
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [12]:
import os

try:
    # Apply sentiment analysis to the 'Heading' column
    df['Sentiment'] = df['Heading'].apply(get_sentiment)
    
    # Define output path using os.path for better cross-platform compatibility
    Twitter_sentiment = os.path.join(os.path.expanduser("~"), "Practice_Twittersent.csv")
    
    # Save to CSV with error handling
    df.to_csv(Twitter_sentiment, index=False)
    print(f"File successfully saved to: {Twitter_sentiment}")
    
except Exception as e:
    print(f"An error occurred: {e}")

File successfully saved to: C:\Users\LENOVO\Practice_Twittersent.csv


In [13]:
try:
    # Read the CSV file using the same path where we saved it earlier
    tweet = pd.read_csv(Twitter_sentiment)  # Using the path variable we defined earlier
    
    # Print basic information about the dataset
    print("\nDataset Info:")
    print(tweet.info())
    
    print("\nFirst few rows:")
    print(tweet.head())
    
    print("\nSentiment Distribution:")
    print(tweet['Sentiment'].value_counts())
    
except FileNotFoundError:
    print(f"Error: File not found at {Twitter_sentiment}")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Heading    14000 non-null  object
 1   Sentiment  14000 non-null  object
dtypes: object(2)
memory usage: 218.9+ KB
None

First few rows:
                                             Heading Sentiment
0  is upset that he can't update his Facebook by ...  Negative
1  @Kenichan I dived many times for the ball. Man...  Positive
2    my whole body feels itchy and like its on fire   Negative
3  @nationwideclass no, it's not behaving at all....  Negative
4                      @Kwesidei not the whole crew    Neutral

Sentiment Distribution:
Sentiment
Negative    5776
Positive    4335
Neutral     3889
Name: count, dtype: int64


In [14]:
import numpy as np

# Create numerical labels for all sentiment classes
labels = np.where(tweet['Sentiment'] == 'Positive', 2,
                 np.where(tweet['Sentiment'] == 'Neutral', 1, 0))

# Alternative approach using label encoding
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# labels = le.fit_transform(tweet['Sentiment'])

# Add the numerical labels back to the dataframe if needed
tweet['sentiment_label'] = labels

# Verify the encoding
print("\nLabel Distribution:")
print(pd.value_counts(labels))
print("\nMapping:")
print("2 = Positive")
print("1 = Neutral")
print("0 = Negative")




Label Distribution:
0    5776
2    4335
1    3889
Name: count, dtype: int64

Mapping:
2 = Positive
1 = Neutral
0 = Negative


  print(pd.value_counts(labels))


    2.2 Data Cleaning and Text Processing
        The following steps are performed:
        - Remove URLs, hashtags, and mentions
        - Convert text to lowercase
        - Remove punctuation and special characters
        - Tokenization
        - Remove stopwords
        - Stemming

In [16]:
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Download required NLTK data
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    print(f"Error downloading stopwords: {e}")

def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    try:
        # Handle None or non-string inputs
        if tweet is None or not isinstance(tweet, str):
            return []
            
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')
        
        # Convert to lowercase
        tweet = tweet.lower()
        
        # Remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # Remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # Remove hyperlinks
        tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
        # Remove hashtags (only the # symbol)
        tweet = re.sub(r'#', '', tweet)
        # Remove emojis and special characters
        tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
        
        # Tokenize tweets
        tokenizer = TweetTokenizer(preserve_case=False, 
                                 strip_handles=True,
                                 reduce_len=True)
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):   # remove punctuation
                stem_word = stemmer.stem(word)     # stemming word
                tweets_clean.append(stem_word)

        return tweets_clean
        
    except Exception as e:
        print(f"Error processing tweet: {e}")
        return []

def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment labels
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """
    try:
        # Convert np array to list
        yslist = np.squeeze(ys).tolist()

        if len(tweets) != len(yslist):
            raise ValueError("Number of tweets and labels must match")

        # Build frequency dictionary
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in tweet:
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1
                
        return freqs
        
    except Exception as e:
        print(f"Error building frequencies: {e}")
        return {}

In [17]:
try:
    # Process tweets with progress tracking
    print("Processing tweets...")
    tweet['Heading'] = tweet['Heading'].apply(process_tweet)
    
    # Verify processing
    print(f"\nProcessed {len(tweet)} tweets")
    
    # Show a sample of processed tweets
    print("\nSample of processed tweets:")
    for i, processed_tweet in enumerate(tweet['Heading'].head(3)):
        print(f"Tweet {i+1}: {processed_tweet}")
        
except Exception as e:
    print(f"Error processing tweets: {e}")

Processing tweets...

Processed 14000 tweets

Sample of processed tweets:
Tweet 1: ['upset', "can't", 'updat', 'facebook', 'text', '...', 'might', 'cri', 'result', 'school', 'today', 'also', 'blah']
Tweet 2: ['dive', 'mani', 'time', 'ball', 'manag', 'save', '50', 'rest', 'go', 'bound']
Tweet 3: ['whole', 'bodi', 'feel', 'itchi', 'like', 'fire']


4. Model Training and Evaluation

In [20]:
from sklearn.model_selection import train_test_split

try:
    # Use the sentiment_label column we created earlier instead of 'type'
    X = tweet.Heading
    y = tweet.sentiment_label  # or tweet.Sentiment for categorical labels
    
    # Create train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, 
        y, 
        test_size=0.2, 
        random_state=42,
        stratify=y  # Ensure balanced classes in both splits
    )
    
    # Print split information
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print("\nLabel distribution in training set:")
    print(pd.Series(y_train).value_counts())
    print("\nLabel distribution in test set:")
    print(pd.Series(y_test).value_counts())
    
except Exception as e:
    print(f"Error during train-test split: {e}")

Training set size: 11200
Test set size: 2800

Label distribution in training set:
sentiment_label
0    4621
2    3468
1    3111
Name: count, dtype: int64

Label distribution in test set:
sentiment_label
0    1155
2     867
1     778
Name: count, dtype: int64


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

try:
    # Convert tokens to strings for vectorization
    X_train_strings = [' '.join(map(str, tokens)) for tokens in X_train]
    X_test_strings = [' '.join(map(str, tokens)) for tokens in X_test]
    
    # Initialize vectorizer with parameters
    vectorizer = CountVectorizer(
        max_features=5000,  # Limit features to prevent memory issues
        min_df=2,          # Ignore terms that appear in less than 2 documents
        max_df=0.95        # Ignore terms that appear in more than 95% of documents
    )
    
    # Fit and transform training data
    X_train_cv = vectorizer.fit_transform(X_train_strings)
    # Transform test data
    X_test_cv = vectorizer.transform(X_test_strings)
    
    # Print vectorization results
    print(f"\nTraining data shape: {X_train_cv.shape}")
    print(f"Test data shape: {X_test_cv.shape}")
    print(f"Number of features: {len(vectorizer.get_feature_names_out())}")
    
    
except Exception as e:
    print(f"Error during vectorization: {e}")


Training data shape: (11200, 4649)
Test data shape: (2800, 4649)
Number of features: 4649


MultinomialNB

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

try:
    # Initialize and train the model
    model = MultinomialNB()
    model.fit(X_train_cv, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train_cv)
    y_pred_test = model.predict(X_test_cv)
    
    # Print training scores
    print("\nTraining Accuracy: {:.2f}%".format(model.score(X_train_cv, y_train) * 100))
    print("Test Accuracy: {:.2f}%".format(model.score(X_test_cv, y_test) * 100))
    
    # Print detailed classification report
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_pred_test, 
                              target_names=['Negative', 'Neutral', 'Positive']))
    
    # Print confusion matrix
    print("\nConfusion Matrix (Test Set):")
    print(confusion_matrix(y_test, y_pred_test))
    
except Exception as e:
    print(f"Error during model training and evaluation: {e}")


Training Accuracy: 80.79%
Test Accuracy: 66.96%

Classification Report (Test Set):
              precision    recall  f1-score   support

    Negative       0.65      0.75      0.70      1155
     Neutral       0.69      0.50      0.58       778
    Positive       0.69      0.71      0.70       867

    accuracy                           0.67      2800
   macro avg       0.68      0.65      0.66      2800
weighted avg       0.67      0.67      0.67      2800


Confusion Matrix (Test Set):
[[871 111 173]
 [285 388 105]
 [189  62 616]]


DecisionTreeClassifier

In [30]:
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

try:
    # Initialize Decision Tree with parameters
    model = tree.DecisionTreeClassifier(
        max_depth=10,           # Prevent overfitting
        min_samples_split=5,    # Minimum samples required to split
        min_samples_leaf=2,     # Minimum samples required at leaf node
        random_state=42         # For reproducibility
    )
    
    # Train the model
    model.fit(X_train_cv, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train_cv)
    y_pred_test = model.predict(X_test_cv)
    
    # Print scores
    print("\nTraining Accuracy: {:.2f}%".format(model.score(X_train_cv, y_train) * 100))
    print("Test Accuracy: {:.2f}%".format(model.score(X_test_cv, y_test) * 100))
    
    # Print detailed classification report
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_pred_test,
                              target_names=['Negative', 'Neutral', 'Positive']))
    
    # Print confusion matrix
    print("\nConfusion Matrix (Test Set):")
    print(confusion_matrix(y_test, y_pred_test))
    
    # Optional: Print feature importance
    feature_names = vectorizer.get_feature_names_out()
    importances = model.feature_importances_
    indices = np.argsort(importances)[-10:]  # Get indices of top 10 features
    
    print("\nTop 10 Most Important Features:")
    for idx in reversed(indices):
        print(f"{feature_names[idx]}: {importances[idx]:.4f}")

except Exception as e:
    
    print(f"Error during model training and evaluation: {e}")


Training Accuracy: 50.96%
Test Accuracy: 50.07%

Classification Report (Test Set):
              precision    recall  f1-score   support

    Negative       0.45      0.92      0.61      1155
     Neutral       0.00      0.00      0.00       778
    Positive       0.76      0.39      0.51       867

    accuracy                           0.50      2800
   macro avg       0.40      0.44      0.37      2800
weighted avg       0.42      0.50      0.41      2800


Confusion Matrix (Test Set):
[[1065    0   90]
 [ 763    0   15]
 [ 529    1  337]]

Top 10 Most Important Features:
wish: 0.1416
hope: 0.1152
love: 0.0976
lol: 0.0836
hate: 0.0783
sad: 0.0733
good: 0.0721
like: 0.0711
miss: 0.0706
bad: 0.0688


RandomForestClassifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

try:
    # Initialize Random Forest with optimized parameters
    model = RandomForestClassifier(
        n_estimators=100,      # More trees for better performance
        max_depth=20,          # Control tree depth to prevent overfitting
        min_samples_split=5,   # Minimum samples required to split
        min_samples_leaf=2,    # Minimum samples at leaf nodes
        random_state=42,       # For reproducibility
        n_jobs=-1              # Use all available cores
    )
    
    # Train the model
    model.fit(X_train_cv, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train_cv)
    y_pred_test = model.predict(X_test_cv)
    
    # Print accuracies
    print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train):.2%}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test):.2%}")
    
    # Print detailed classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_test,
                              target_names=['Negative', 'Neutral', 'Positive']))
    
    # Print confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_test))
    
    # Print feature importance
    feature_names = vectorizer.get_feature_names_out()
    importances = model.feature_importances_
    indices = np.argsort(importances)[-10:]  # Get top 10 features
    
    print("\nTop 10 Most Important Features:")
    for idx in reversed(indices):
        print(f"{feature_names[idx]}: {importances[idx]:.4f}")

except Exception as e:
    print(f"Error during model training and evaluation: {e}")

Training Accuracy: 54.67%
Test Accuracy: 54.14%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.47      0.95      0.63      1155
     Neutral       0.00      0.00      0.00       778
    Positive       0.85      0.49      0.62       867

    accuracy                           0.54      2800
   macro avg       0.44      0.48      0.42      2800
weighted avg       0.46      0.54      0.45      2800


Confusion Matrix:
[[1092    0   63]
 [ 765    0   13]
 [ 443    0  424]]

Top 10 Most Important Features:
wish: 0.0769
love: 0.0450
hope: 0.0429
good: 0.0366
hate: 0.0363
sick: 0.0356
lol: 0.0348
bad: 0.0266
like: 0.0259
thank: 0.0221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Support Vector Machine (SVM)

In [32]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

try:
    # Initialize SVM with optimized parameters
    model = SVC(
        kernel='linear',     # Linear kernel for text classification
        C=1.0,              # Regularization parameter
        random_state=42,     # For reproducibility
        probability=True,    # Enable probability estimates
        class_weight='balanced'  # Handle class imbalance
    )
    
    print("Training SVM model (this may take a while)...")
    
    # Train the model
    model.fit(X_train_cv, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train_cv)
    y_pred_test = model.predict(X_test_cv)
    
    # Print accuracies
    print(f"\nTraining Accuracy: {accuracy_score(y_train, y_pred_train):.2%}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test):.2%}")
    
    # Print detailed classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_test,
                              target_names=['Negative', 'Neutral', 'Positive']))
    
    # Print confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_test))

except Exception as e:
    print(f"Error during model training and evaluation: {e}")

Training SVM model (this may take a while)...

Training Accuracy: 90.38%
Test Accuracy: 75.68%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.79      0.73      0.76      1155
     Neutral       0.73      0.82      0.77       778
    Positive       0.75      0.74      0.74       867

    accuracy                           0.76      2800
   macro avg       0.75      0.76      0.76      2800
weighted avg       0.76      0.76      0.76      2800


Confusion Matrix:
[[840 151 164]
 [ 87 638  53]
 [142  84 641]]


5. Prediction Function
Test the model with custom input text to predict sentiment.

In [33]:
try:
    # Input text to analyze
    test_text = ['Murder']
    
    # Transform input using the same vectorizer
    test_vector = vectorizer.transform(test_text)
    
    # Make prediction
    prediction = model.predict(test_vector)
    
    # Get probability scores if model supports it
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(test_vector)[0]
        confidence = max(probabilities) * 100
    else:
        confidence = None
    
    # Map prediction to sentiment
    sentiment_map = {
        0: 'Negative',
        1: 'Neutral',
        2: 'Positive'
    }
    predicted_sentiment = sentiment_map.get(prediction[0], 'Unknown')
    
    # Print results
    print(f"\nInput text: '{test_text[0]}'")
    print(f"Predicted sentiment: {predicted_sentiment}")
    if confidence:
        print(f"Confidence: {confidence:.2f}%")
        
    # Optional: Show probabilities for each class if available
    if hasattr(model, 'predict_proba'):
        print("\nClass probabilities:")
        for sentiment, prob in zip(['Negative', 'Neutral', 'Positive'], probabilities):
            print(f"{sentiment}: {prob:.2f}")

except Exception as e:
    print(f"Error during prediction: {e}")


Input text: 'Murder'
Predicted sentiment: Negative
Confidence: 58.37%

Class probabilities:
Negative: 0.58
Neutral: 0.25
Positive: 0.16
