#installing nesseccary libraries

In [1]:
!pip install pandas
!pip install nltk
!pip install numpy
!pip install scikit-learn



In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Tuple
import numpy as np

#define a function to load data

In [8]:
def load_data():
    files = ["processedNegative.csv", "processedPositive.csv", "processedNeutral.csv"]
    try:
        df = pd.DataFrame(columns=["text", "sentiment"])
        for file in files:
            with open("data/" + file, "r") as f:
                content = f.read()
                tweets = content.split(",")
                tmp_df = pd.DataFrame(tweets, columns=["text"])
                sentiment = file[9:-4]
                tmp_df["sentiment"] = sentiment
                df = pd.concat([df, tmp_df])

    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    return df

call load_data() and print a sample

In [11]:
df = load_data()
print(df.head())


                                                text sentiment
0              How unhappy  some dogs like it though  Negative
1  talking to my over driver about where I'm goin...  Negative
2  Does anybody know if the Rand's likely to fall...  Negative
3         I miss going to gigs in Liverpool unhappy   Negative
4      There isnt a new Riverdale tonight ? unhappy   Negative


#define textpreprocessor class

In [5]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
nltk.download('wordnet')
nltk.download('stopwords')

class TextPreprocessor:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text, config):
        """
        config = {
            'case': 'original' or 'lower',
            'special_chars': 'keep' or 'remove',
            'stop_words': 'keep' or 'remove',
            'normalization': 'none' or 'stem' or 'lemma'
        }
        """
        # Case handling
        if config['case'] == 'lower':
            text = text.lower()

        # Special characters
        if config['special_chars'] == 'remove':
            text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenization
        words = text.split()

        # Stop words
        if config['stop_words'] == 'remove':
            words = [w for w in words if w not in self.stop_words]

        # Normalization
        if config['normalization'] == 'stem':
            words = [self.stemmer.stem(w) for w in words]
        elif config['normalization'] == 'lemma':
            words = [self.lemmatizer.lemmatize(w) for w in words]

        return ' '.join(words)

    # Generate all 18 combinations
    def get_preprocessing_configs(self):
        configs = []
        for case in ['original', 'lower']:
            for special_chars in ['keep', 'remove']:
                for stop_words in ['keep', 'remove']:
                    for norm in ['none', 'stem', 'lemma']:
                        configs.append({
                            'case': case,
                            'special_chars': special_chars,
                            'stop_words': stop_words,
                            'normalization': norm
                        })
        return configs[:2]  # Return first 18 combinations

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


#preprocess the data

In [6]:
preprocessor = TextPreprocessor()
configs = preprocessor.get_preprocessing_configs()

# Process tweets with each configuration
processed_datasets = []
for config in configs:
    processed_df = df.copy()
    processed_df['text'] = processed_df['text'].apply(
        lambda x: preprocessor.preprocess(x, config)
    )
    processed_datasets.append((config, processed_df))

# Save each processed dataset
for i, (config, dataset) in enumerate(processed_datasets):
    dataset.to_csv(f'processed_dataset_{i+1}.csv', index=False)

#load the preprocessed data

In [7]:
from sklearn.model_selection import train_test_split

def create_preprocessed_datasets(df, preprocessor):
    # Get all preprocessing configurations
    configs = preprocessor.get_preprocessing_configs()
    datasets = []
    
    for i, config in enumerate(configs, 1):
        print(f"Creating dataset {i}/18 with config: {config}")
        processed_df = df.copy()
        processed_df['text'] = processed_df['text'].apply(
            lambda x: preprocessor.preprocess(x, config)
        )
        datasets.append((f"dataset_{i}", processed_df, config))
    return datasets

#vectorize dataset

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Tuple
import numpy as np
import pandas as pd

def VectorizeWords(DataFrame: pd.DataFrame) -> Tuple[TfidfVectorizer, np.ndarray, pd.Series]:
    """
    Convert text data to TF-IDF vectors
    
    Args:
        DataFrame: pandas DataFrame containing 'text' and 'sentiment' columns
    
    Returns:
        Tuple containing:
        - fitted TfidfVectorizer
        - transformed text data as numpy array
        - sentiment labels as pandas Series
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(DataFrame['text']).toarray()
    y = DataFrame['sentiment']
    
    return vectorizer, X, y

In [9]:
def prepare_datasets(datasets):
    prepared_data = []
    
    for name, df, config in datasets:
        # Vectorize
        vectorizer, X, y = VectorizeWords(df)
        
        # Split into train/test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        prepared_data.append({
            'name': name,
            'config': config,
            'vectorizer': vectorizer,
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        })
    
    return prepared_data

In [10]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def train_and_evaluate(prepared_data):
    results = []
    
    for data in prepared_data:
        print(f"\nProcessing {data['name']}...")
        
        # Perform GridSearch
        best_models = perform_gridsearch(data['X_train'], data['y_train'])
        
        # Evaluate each model
        for model_name, model_info in best_models.items():
            best_model = model_info['best_model']
            
            # Make predictions
            y_pred = best_model.predict(data['X_test'])
            
            # Calculate metrics
            accuracy = accuracy_score(data['y_test'], y_pred)
            
            results.append({
                'dataset': data['name'],
                'preprocessing_config': data['config'],
                'model': model_name,
                'best_params': model_info['best_params'],
                'accuracy': accuracy,
                'model_object': best_model,
                'vectorizer': data['vectorizer']
            })
            
            print(f"{model_name} Accuracy: {accuracy:.3f}")
            print("\nClassification Report:")
            print(classification_report(data['y_test'], y_pred))
    
    return results

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def perform_gridsearch(X_train, y_train):
    # Define models and their parameter grids
    models = {
        'svm': {
            'model': SVC(),
            'params': {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto', 0.1, 1],
            }
        }#,
        # 'random_forest': {
        #     'model': RandomForestClassifier(),
        #     'params': {
        #         'n_estimators': [100, 200, 300],
        #         'max_depth': [10, 20, None],
        #         'min_samples_split': [2, 5, 10],
        #         'min_samples_leaf': [1, 2, 4]
        #     }
        # },
        # 'logistic_regression': {
        #     'model': LogisticRegression(),
        #     'params': {
        #         'C': [1.0],#[0.01, 0.1, 1, 10],
        #         'penalty': ['l2'],#['l1', 'l2'],
        #         'solver': ['liblinear', 'saga'],
        #         'max_iter': [1000]
        #     }
        # }
    }
    
    best_models = {}
    
    # Perform GridSearch for each model
    for model_name, model_info in models.items():
        print(f"\nPerforming GridSearch for {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=model_info['model'],
            param_grid=model_info['params'],
            cv=5,                # 5-fold cross-validation
            scoring='f1_macro',   # Using AUC score
            n_jobs=-1,          # Use all CPU cores
            verbose=1
        )
        
        # Fit the grid search
        grid_search.fit(X_train, y_train)
        
        # Store results
        best_models[model_name] = {
            'best_model': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        }
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best AUC score: {grid_search.best_score_:.3f}")
    
    return best_models

In [12]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def train_and_evaluate(prepared_data):
    results = []
    
    for data in prepared_data:
        print(f"\nProcessing {data['name']}...")
        
        # Perform GridSearch
        best_models = perform_gridsearch(data['X_train'], data['y_train'])
        
        # Evaluate each model
        for model_name, model_info in best_models.items():
            best_model = model_info['best_model']
            
            # Make predictions
            y_pred = best_model.predict(data['X_test'])
            
            # Calculate metrics
            accuracy = accuracy_score(data['y_test'], y_pred)
            
            results.append({
                'dataset': data['name'],
                'preprocessing_config': data['config'],
                'model': model_name,
                'best_params': model_info['best_params'],
                'accuracy': accuracy,
                'model_object': best_model,
                'vectorizer': data['vectorizer']
            })
            
            print(f"{model_name} Accuracy: {accuracy:.3f}")
            print("\nClassification Report:")
            print(classification_report(data['y_test'], y_pred))
    
    return results

In [None]:
import pickle

def save_model(model, filename):
    """
    Save a model to a file using pickle
    
    Args:
        model: The model object to save
        filename: The path where to save the model
    """
    try:
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model successfully saved to {filename}")
    except Exception as e:
        print(f"Error saving model to {filename}: {str(e)}")

# And here's how to load the model later when you need it:
def load_model(filename):
    """
    Load a model from a file
    
    Args:
        filename: The path to the saved model file
    
    Returns:
        The loaded model object
    """
    try:
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        print(f"Model successfully loaded from {filename}")
        return model
    except Exception as e:
        print(f"Error loading model from {filename}: {str(e)}")
        return None

In [13]:
def main():
    # Load your data using the previously defined load_data function
    df = load_data()
    if df is None:
        print("Error loading data. Please check your data files.")
        return
    
    # Initialize preprocessor
    preprocessor = TextPreprocessor()
    
    # Create different preprocessed versions
    print("Creating preprocessed datasets...")
    datasets = create_preprocessed_datasets(df, preprocessor)
    
    # Prepare datasets for training
    print("\nPreparing datasets for training...")
    prepared_data = prepare_datasets(datasets)
    
    # Train and evaluate models
    print("\nTraining and evaluating models...")
    results = train_and_evaluate(prepared_data)
    
    # Find best performing model
    best_result = max(results, key=lambda x: x['accuracy'])
    print("\nBest performing model:")
    print(f"Dataset: {best_result['dataset']}")
    print(f"Preprocessing config: {best_result['preprocessing_config']}")
    print(f"Model: {best_result['model']}")
    print(f"Accuracy: {best_result['accuracy']:.3f}")
    
    # Save best model and vectorizer
    save_model(best_result['model_object'], 'best_model.pkl')
    save_model(best_result['vectorizer'], 'best_vectorizer.pkl')

if __name__ == "__main__":
    main()

Creating preprocessed datasets...
Creating dataset 1/18 with config: {'case': 'original', 'special_chars': 'keep', 'stop_words': 'keep', 'normalization': 'none'}
Creating dataset 2/18 with config: {'case': 'original', 'special_chars': 'keep', 'stop_words': 'keep', 'normalization': 'stem'}

Preparing datasets for training...

Training and evaluating models...

Processing dataset_1...

Performing GridSearch for svm...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for svm: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best AUC score: 0.878
svm Accuracy: 0.901

Classification Report:
              precision    recall  f1-score   support

    Negative       0.96      0.86      0.91       224
     Neutral       0.85      0.97      0.91       314
    Positive       0.93      0.84      0.88       237

    accuracy                           0.90       775
   macro avg       0.91      0.89      0.90       775
weighted avg       0.91      0.90      0.90       775


KeyboardInterrupt: 