In [42]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
import re
import nltk

In [43]:
# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
import zipfile

# Define the path to the ZIP file and the output CSV file
zip_file_path = r'c:\Users\admin\Downloads\training.1600000.processed.noemoticon.csv.zip'
csv_file_output_path = r'c:\Users\admin\Downloads\training_processed_noemoticon.csv'

# Open the ZIP file and extract the CSV file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # List all files in the ZIP archive (assume there's only one CSV inside)
    csv_file_name = zip_ref.namelist()[0]  # Name of the CSV inside the ZIP
    # Load the CSV file into a DataFrame directly
    df = pd.read_csv(zip_ref.open(csv_file_name), encoding='latin-1', header=None)
    # Save DataFrame to a new CSV file
    df.to_csv(csv_file_output_path, index=False)


In [46]:
df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [47]:
# Assign column names to the dataset
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

In [48]:
# Display the first few rows of the dataset
print(df.head())

   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [49]:
# Keep only the relevant columns: 'target' and 'text'
df = df[['target', 'text']]

In [50]:
# Map target values to 0 = negative, 1 = neutral, 2 = positive
df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})


In [51]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Preprocessing function to clean tweets
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@\w+', '', text)    # Remove mentions
    text = re.sub(r'#\w+', '', text)    # Remove hashtags
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = text.lower()                 # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters

    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


    return text


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
df['text'] = df['text'].apply(preprocess_text)

In [35]:
# Split data into training and testing sets
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Convert text data to numerical using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [37]:
# Initialize and train the logistic regression model
#model = LogisticRegression(max_iter=200)
model = LogisticRegression(max_iter=200, class_weight='balanced')
model.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'max_iter': [100, 200, 300]}
grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train_tfidf, y_train)

ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=2.

In [None]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Unigrams and bigrams

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=['Negative', 'Neutral', 'Positive'], zero_division=1))