In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the tweet dataset from Drive and discard the "Unnamed: 0" column
# (presumably an index that was saved into the CSV -- confirm).
csv_path = "/content/drive/MyDrive/Redback_A/chatgpt.csv"
data = pd.read_csv(csv_path).drop(columns=["Unnamed: 0"])

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  219294 non-null  object
 1   labels  219294 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


In [None]:
# Fraction of the dataset carrying each label: per-label tweet count divided
# by the total number of rows.
tweets_per_label = data.groupby("labels")["tweets"].count()
score_cnt = tweets_per_label / len(data)
score_cnt

labels
bad        0.491559
good       0.255415
neutral    0.253026
Name: tweets, dtype: float64

In [None]:
import nltk
# NOTE(review): word_tokenize relies on NLTK tokenizer data that must already be
# present on the runtime; the download below was left commented out, so on a
# fresh runtime it presumably has to be uncommented and run once -- confirm
# which resource the installed NLTK version expects.
#nltk.download('punkt')
from nltk import word_tokenize

# Split every tweet into a list of word tokens
word_tokens = [word_tokenize(com) for com in data.tweets]

# Number of tokens in each tweet, in the same order as data.tweets
len_tokens = [len(tokens) for tokens in word_tokens]

In [None]:
# Create a new feature holding the length (number of word tokens) of each tweet.
# len_tokens was built in the same row order as data.tweets, so positions line up.
data['n_words'] = len_tokens

In [None]:
# Preview the first rows to check the new n_words column.
data.head()

Unnamed: 0,tweets,labels,n_words
0,ChatGPT: Optimizing Language Models for Dialog...,neutral,12
1,"Try talking with ChatGPT, our new AI system wh...",good,26
2,ChatGPT: Optimizing Language Models for Dialog...,neutral,38
3,"THRILLED to share that ChatGPT, our new model ...",good,33
4,"As of 2 minutes ago, @OpenAI released their ne...",bad,27


In [None]:
# Import the TfidfVectorizer and default list of English stop words
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Extra common words to drop in addition to the default English stop words
myWords = ["answer", "ask", "chatgpt", "chatbot"]
# list.append() mutates in place and returns None, so assigning its result left
# my_stop_words as None and no stop words were removed at all. Build the
# combined collection with a set union instead; sorted() turns it into a list
# with a deterministic order.
my_stop_words = sorted(ENGLISH_STOP_WORDS.union(myWords))

# Build the vectorizer: unigrams and bigrams over tokens made of two or more
# non-digit word characters, keeping at most 50 terms whose document frequency
# lies between 10% and 90% of the tweets.
# NOTE(review): now that stop words are really filtered out, min_df=0.1 may
# leave only a handful of terms -- consider lowering it if the vocabulary
# becomes too small.
vect = TfidfVectorizer(stop_words=my_stop_words,
                       ngram_range=(1, 2),
                       max_features=50,
                       max_df=0.9,
                       min_df=0.1,
                       token_pattern=r'\b[^\d\W][^\d\W]+\b')

# Learn the vocabulary and create the sparse TF-IDF matrix in a single pass
X = vect.fit_transform(data.tweets)

# Create a DataFrame with one column per term; reuse data's index so the later
# index-based merge pairs each feature row with its own tweet
reviews_transformed = pd.DataFrame(X.toarray(),
                                   columns=vect.get_feature_names_out(),
                                   index=data.index)

In [None]:
# Preview the TF-IDF feature columns for the first few tweets.
reviews_transformed.head()

Unnamed: 0,about,ai,and,can,chatgpt is,co,for,https,https co,in,...,it,of,on,openai,that,the,this,to,with,you
0,0.0,0.0,0.0,0.0,0.0,0.290016,0.544847,0.31021,0.310235,0.0,...,0.0,0.0,0.0,0.653113,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.420807,0.0,0.0,0.0,0.227749,0.427868,0.243607,0.243627,0.0,...,0.37871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.453786,0.0
2,0.0,0.6709,0.0,0.0,0.0,0.363105,0.341079,0.388387,0.388419,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.219929,0.0,0.0,0.42558,0.26651,0.455213,0.45525,0.0,...,0.0,0.0,0.0,0.0,0.302861,0.0,0.0,0.383087,0.0,0.0
4,0.0,0.0,0.0,0.390512,0.0,0.344592,0.0,0.368585,0.368615,0.0,...,0.2865,0.285882,0.0,0.388008,0.0,0.0,0.0,0.0,0.0,0.377708


In [None]:
# Remove the raw tweet text so that only the non-text columns remain for the
# merge with the TF-IDF features
data = data.drop(columns=['tweets'])

In [None]:
# Preview the remaining columns after dropping the tweet text.
data.head()

Unnamed: 0,labels,n_words
0,neutral,12
1,good,26
2,neutral,38
3,good,33
4,bad,27


In [None]:
# Attach the remaining columns of data to the TF-IDF features by matching
# rows on their index
reviews_transformed = pd.merge(
    reviews_transformed,
    data,
    left_index=True,
    right_index=True,
)

In [None]:
# Import the random forest classifier plus the model-selection and metric helpers
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

# Target is the labels column; every other column is used as a feature
y = reviews_transformed['labels']
X = reviews_transformed.drop(columns=['labels'])

# Hold out 20% of the rows for testing, stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=456,
)

In [None]:
# Fixed seed so that the forests -- and therefore the grid-search winner and the
# cross-validation scores -- are reproducible from one run to the next
# (456 is the same seed the train/test split uses)
RF_SEED = 456

# Create the base Random Forest Classifier whose hyper-parameters are tuned
rf = RandomForestClassifier(random_state=RF_SEED)

# Set the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, None],
    'max_features': ['sqrt', 'log2']
}

# Use GridSearchCV (3-fold) to evaluate every combination in the grid;
# n_jobs=-1 spreads the independent fits over all available cores
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameter values
print("Best parameters: ", grid_search.best_params_)

# Create a fresh, unfitted classifier from the best parameter values; unpacking
# best_params_ keeps this in sync with whatever keys param_grid contains
best_rf = RandomForestClassifier(random_state=RF_SEED, **grid_search.best_params_)

# Perform 5-fold cross-validation on the training set
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5)

# Print the estimator configuration and the cross-validation scores
print("Parameter values of best_rf: ", best_rf.get_params())
print("Cross-validation scores:", cv_scores)

Best parameters:  {'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}
Parameter values of best_rf:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Cross-validation scores: [0.55336734 0.55017528 0.55217032 0.5513438  0.55020378]


In [None]:
# Train the tuned forest on the full training split
random_forest = best_rf.fit(X_train, y_train)

# Label the held-out rows
y_predicted = random_forest.predict(X_test)

# Test accuracy, and the confusion matrix expressed as fractions of the test set
test_accuracy = accuracy_score(y_test, y_predicted)
confusion_fractions = confusion_matrix(y_test, y_predicted) / len(y_test)

print('Accuracy on the test set: ', test_accuracy)
print(confusion_fractions)

Accuracy on the test set:  0.5504685469344947
[[0.41460134 0.07624433 0.00070681]
 [0.12971112 0.12519665 0.00050161]
 [0.17487859 0.067489   0.01067056]]
