In [31]:
import pandas as pd
from pathlib import Path

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Data importation
Technical difficulties prevented me from importing full dataset.  I was able to open the original csv in Excel, export it as a new csv specified to be encoded in UTC-8, and then import that new csv in Jupyter Lab successfully. This limited the dataset to 1M lines, removing some 600,000 rows. This should still be a sizable dataset for analysis, however as the original data was ordered by sentiment, this removed a significant amount of positive sentimented text.

In [20]:
filepath = Path("training8.csv")
tweets = pd.read_csv(filepath, header=None, names=["sentiment", "id", "time", "query", "name", "text"])
tweets

Unnamed: 0,sentiment,id,time,query,name,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
999995,4,1879942807,Thu May 21 23:36:19 PDT 2009,NO_QUERY,divabat,"@healingsinger thank you, i needed that"
999996,4,1879942922,Thu May 21 23:36:20 PDT 2009,NO_QUERY,nick1975,@vactress http://bit.ly/cADea Maybe this is m...
999997,4,1879942975,Thu May 21 23:36:21 PDT 2009,NO_QUERY,znmeb,"@Brat13 Hell, Windows 7 will be out of my pric..."
999998,4,1879943113,Thu May 21 23:36:22 PDT 2009,NO_QUERY,virmani,@jigardoshi neah.. i wish! just reminiscing r...


In [21]:
#Does column 'NO_QUERY' contain any useful data or is every row just 'NO_QUERY'?
tweets['query'].value_counts()

NO_QUERY    1000000
Name: query, dtype: int64

In [22]:
# Tweet id's and query have no data useful for sentiment analysis. There could be something in 'time,' but
# without timezone data it can't be determined what time of day tweets were sent. Names are unlikely to be useful.
tweets = tweets.drop(columns=['id', 'time', 'query', 'name'])
tweets.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [23]:
tweets.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [24]:
tweets.iloc[0:2]

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...


In [25]:
# Lemmatizer script
lemmatizer = WordNetLemmatizer()

def process_text(text): 
    sw = set(stopwords.words('english')) 
    regex = re.compile("[^a-zA-Z ]") 
    re_clean = regex.sub('', text)
    words = word_tokenize(re_clean) 
    lem = [lemmatizer.lemmatize(word) for word in words] 
    output = ' '.join([word.lower() for word in lem if word.lower() not in sw]) 
    return output

In [26]:
tweets['text'] = tweets['text'].apply(lambda x: process_text(x))

In [27]:
tweets.head()

Unnamed: 0,sentiment,text
0,0,switchfoot httptwitpiccomyzl awww thats bummer...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many time ball managed save res...
3,0,whole body feel itchy like fire
4,0,nationwideclass behaving im mad cant see


In [29]:
tweets['text'][0]

'switchfoot httptwitpiccomyzl awww thats bummer shoulda got david carr third day'

## Feature Extraction

In [34]:
# Choose a feature extraction method (e.g., TF-IDF, Gensim, or a pretrained language model) and transform the text data into numerical features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(tweets['text'])
y = tweets['sentiment']
X

<1000000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 6159706 stored elements in Compressed Sparse Row format>

## Model Training

In [35]:


# 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)








In [37]:
# Create and train a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

## Model evaluation

## Sentiment Analysis

In [None]:
# Delete if possible
keyword = 'twitter'
mention = tweets['text'].str.contains(keyword, case=False, na=False)
twitter_df = tweets[mention]
twitter_df

In [None]:
# Use if possible
keyword = 'twitter'
mention = tweets['text'].apply(lambda x: keyword in x)
twitter_df = tweets[mention]


## Visualizations

In [None]:
# Word Cloud

In [None]:
# Sentiment across time? if not just a bar plot