In [125]:
import csv
from sklearn.model_selection import train_test_split
with open('file.csv',encoding="utf8") as file:
    reader = csv.reader(file)
    data = list(reader)
    
data = [[row[0], row[1], row[2]] for row in data]

X = [row[1] for row in data]  
y = [row[2] for row in data]  
#80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [126]:
tech_keywords = ["technology", "innovation", "software", "data", "internet", "digital","Google","Facebook","phone"]
transport_keywords = ["transportation", "vehicle", "car", "traffic", "road", "train","highway","traffic"]
class ExtendedLexiconClassifier():
    def __init__(self, tech_keywords, transport_keywords):
        """
        Initialize the Extended Lexicon Classifier.

        Parameters:
        tech_keywords (list): List of keywords related to technology.
        transport_keywords (list): List of keywords related to transportation.
        """
        self.tech_keywords = set(tech_keywords)
        self.transport_keywords = set(transport_keywords)

    def count_tech_words(self, text):
        """
        Count the number of technology-related words in the input text.

        Parameters:
        text (str): Input text.

        Returns:
        count (int): Count of technology-related words.
        """
        return sum(text.lower().count(word) for word in self.tech_keywords)

    def count_transport_words(self, text):
        """
        Count the number of transportation-related words in the input text.

        Parameters:
        text (str): Input text.

        Returns:
        count (int): Count of transportation-related words.
        """
        return sum(text.lower().count(word) for word in self.transport_keywords)
    
    def count_exclamation_marks(self, text):
        """
        Count the number of exclamation marks in the input text.

        Parameters:
        text (str): Input text.

        Returns:
        count (int): Count of exclamation marks.
        """
        return text.count('!')

    def count_capital_letters(self, text):
        """
        Count the number of capital letters in the input text.

        Parameters:
        text (str): Input text.

        Returns:
        count (int): Count of capital letters.
        """
        return sum(1 for char in text if char.isupper())


In [134]:
#LOGISTIC & LINEARSVC ON TEST WITHOUT LEXICON

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the text data like hw
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize the classifiers
logistic_classifier = LogisticRegression()
linear_svc_classifier = LinearSVC()

import random
random.seed(1)


# Train and evaluate Logistic Regression classifier
logistic_classifier.fit(X_train_vec, y_train)
logistic_preds = logistic_classifier.predict(X_test_vec)

precision_logistic = precision_score(y_test, logistic_preds, average='micro')
recall_logistic = recall_score(y_test, logistic_preds, average='micro')
f1_logistic = f1_score(y_test, logistic_preds, average='micro')
precision_logistic_ma = precision_score(y_test, logistic_preds, average='macro')
recall_logistic_ma = recall_score(y_test, logistic_preds, average='macro')
f1_logistic_ma = f1_score(y_test, logistic_preds, average='macro')


# Train and evaluate LinearSVC classifier
linear_svc_classifier.fit(X_train_vec, y_train)
linear_svc_preds = linear_svc_classifier.predict(X_test_vec)

precision_linear_svc = precision_score(y_test, linear_svc_preds, average='micro')
recall_linear_svc = recall_score(y_test, linear_svc_preds, average='micro')
f1_linear_svc = f1_score(y_test, linear_svc_preds, average='micro')
precision_linear_svc_ma = precision_score(y_test, linear_svc_preds, average='macro')
recall_linear_svc_ma = recall_score(y_test, linear_svc_preds, average='macro')
f1_linear_svc_ma = f1_score(y_test, linear_svc_preds, average='macro')

# Print evaluation metrics for Logistic Regression
print("Logistic Regression:")
print("Precision Micro:", precision_logistic)
print("Recall Micro:", recall_logistic)
print("F1 Score Macro:", f1_logistic)
print("Precision Macro:", precision_logistic_ma)
print("Recall Macro:", recall_logistic_ma)
print("F1 Score Macro:", f1_logistic_ma)


# Print evaluation metrics for LinearSVC
print("\nLinearSVC:")
print("Precision Micro:", precision_linear_svc)
print("Recall Micro:", recall_linear_svc)
print("F1 Score Micro:", f1_linear_svc)
print("Precision Macro:", precision_linear_svc_ma)
print("Recall Macro:", recall_linear_svc_ma)
print("F1 Score Macro:", f1_linear_svc_ma)

Logistic Regression:
Precision Micro: 0.6865671641791045
Recall Micro: 0.6865671641791045
F1 Score Macro: 0.6865671641791045
Precision Macro: 0.45704779189352696
Recall Macro: 0.4817170702674553
F1 Score Macro: 0.468757172872857

LinearSVC:
Precision Micro: 0.7014925373134329
Recall Micro: 0.7014925373134329
F1 Score Micro: 0.7014925373134329
Precision Macro: 0.5567139282735614
Recall Macro: 0.5202781074893562
F1 Score Macro: 0.5259179395028452


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [138]:
#LOGISTIC & LINEARSVC ON CROSS VALIDATION SET WITH NO LEXICON.

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

#Vectorize the text data like hw
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train) # This should be a matrix
X_test_vectorized = vectorizer.transform(X_test) # This should be a matrix

import random
random.seed(1)



# Initialize the classifiers
logistic_classifier = LogisticRegression()
linear_svc_classifier = LinearSVC()

# Define parameter grids for hyperparameter tuning
logistic_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
linear_svc_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Initialize GridSearchCV for Logistic Regression
logistic_grid_search = GridSearchCV(logistic_classifier, logistic_param_grid, cv=5, scoring='accuracy')
logistic_grid_search.fit(X_train_vectorized, y_train)

# Initialize GridSearchCV for LinearSVC
linear_svc_grid_search = GridSearchCV(linear_svc_classifier, linear_svc_param_grid, cv=5, scoring='accuracy')
linear_svc_grid_search.fit(X_train_vectorized, y_train)

# Get best models
best_logistic_model = logistic_grid_search.best_estimator_
best_linear_svc_model = linear_svc_grid_search.best_estimator_

# Evaluate Logistic Regression model
logistic_preds = best_logistic_model.predict(X_test_vectorized)
precision_logistic = precision_score(y_test, logistic_preds, average='micro')
recall_logistic = recall_score(y_test, logistic_preds, average='micro')
f1_logistic = f1_score(y_test, logistic_preds, average='micro')
precision_logistic_ma = precision_score(y_test, logistic_preds, average='macro')
recall_logistic_ma = recall_score(y_test, logistic_preds, average='macro')
f1_logistic_ma = f1_score(y_test, logistic_preds, average='macro')

# Evaluate LinearSVC model
linear_svc_preds = best_linear_svc_model.predict(X_test_vectorized)
precision_linear_svc = precision_score(y_test, linear_svc_preds, average='micro')
recall_linear_svc = recall_score(y_test, linear_svc_preds, average='micro')
f1_linear_svc = f1_score(y_test, linear_svc_preds, average='micro')
precision_linear_svc_ma = precision_score(y_test, linear_svc_preds, average='macro')
recall_linear_svc_ma = recall_score(y_test, linear_svc_preds, average='macro')
f1_linear_svc_ma = f1_score(y_test, linear_svc_preds, average='macro')

# Print evaluation metrics for Logistic Regression
print("Logistic Regression:")
#validation_score_logistic=best_logistic_model
#print("Validation F1: {:.4f}", (validation_score_logistic))
#print("Validation F1: {:.4f}".format(validation_score_logistic))
print("Precision Micro:", precision_logistic)
print("Recall Micro:", recall_logistic)
print("F1 Score Micro:", f1_logistic)
print("Precision Macro:", precision_logistic_ma)
print("Recall Macro:", recall_logistic_ma)
print("F1 Score Macro:", f1_logistic_ma)


# Print evaluation metrics for LinearSVC
print("\nLinearSVC:")
#validation_score_svc_model=best_linear_svc_model
#print("Validation F1: {:.4f}", (validation_score_svc_model)), annoying error same as above
print("Precision Micro:", precision_linear_svc)
print("Recall Micro:", recall_linear_svc)
print("F1 Score Micro:", f1_linear_svc)
print("Precision Macro:", precision_linear_svc_ma)
print("Recall Macro:", recall_linear_svc_ma)
print("F1 Score Macro:", f1_linear_svc_ma)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
Precision Micro: 0.6567164179104478
Recall Micro: 0.6567164179104478
F1 Score Micro: 0.6567164179104478
Precision Macro: 0.4587795342512324
Recall Macro: 0.4509913328125581
F1 Score Macro: 0.4348321001707456

LinearSVC:
Precision Micro: 0.6417910447761194
Recall Micro: 0.6417910447761194
F1 Score Micro: 0.6417910447761194
Precision Macro: 0.42717497556207235
Recall Macro: 0.4466391399769371
F1 Score Macro: 0.43413559686257647


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [139]:
# WRITE CODE HERE

X_train_lexicon_features = [] # Initailze to an empty list. This will be a list of lists
X_test_lexicon_features = [] #  Initailze to an empty list. This will be a list of lists

# Loop over X_txt_test
#    for each string in X_txt_test (i.e., for each item in the list), pass it to LexiconClassifiers .count_pos_words() and count_neg_words method
#    append a list with the counts to X_test_lexicon_features []
c=ExtendedLexiconClassifier()

for row in X_test:
    tech_words_count = lexicon_classifier.count_tech_words(row)
    transport_words_count = lexicon_classifier.count_transport_words(row)
    exclamation_count = lexicon_classifier.count_exclamation_marks(row)
    capital_letter_count = lexicon_classifier.count_capital_letters(row)
    X_test_lexicon_features.append([tech_words_count,transport_words_count,exclamation_count,capital_letter_count])


for row in X_train:
    tech_words_count = lexicon_classifier.count_tech_words(row)
    transport_words_count = lexicon_classifier.count_transport_words(row)
    exclamation_count = lexicon_classifier.count_exclamation_marks(row)
    capital_letter_count = lexicon_classifier.count_capital_letters(row)
    X_train_lexicon_features.append([tech_words_count,transport_words_count,exclamation_count,capital_letter_count])

In [140]:
#OK NOW #LOGISTIC & LINEARSVC ON CROSSVALIDATION WITH LEXICON
import numpy as np
import scipy.sparse as sp
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

import random
random.seed(1)

# 1. Convert X_txt_train and X_txt_test to matricies of numbers (i.e., use CountVectorizer)
X_train_with_lex=vec.fit_transform(X_train)
X_test_with_lex=vec.transform(X_test)

# 2. Now we need to convert X_train_lexicon_features and X_test_lexicon_features to numpy arrays
X_train_lexicon_features_array=np.array(X_train_lexicon_features)
X_test_lexicon_features_array=np.array(X_test_lexicon_features)

#3. sp.hstack with arrays

X_train_with_lex=sp.hstack([X_train_with_lex,X_train_lexicon_features_array])
X_test_with_lex=sp.hstack([X_test_with_lex,X_test_lexicon_features_array])


# Initialize the classifiers
logistic_classifier_lex_cv = LogisticRegression()
linear_svc_classifier = LinearSVC()

# Define parameter grids for hyperparameter tuning
logistic_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
linear_svc_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Initialize GridSearchCV for Logistic Regression
logistic_grid_search = GridSearchCV(logistic_classifier_lex_cv, logistic_param_grid, cv=5, scoring='accuracy')
logistic_grid_search.fit(X_train_with_lex, y_train)

# Initialize GridSearchCV for LinearSVC
linear_svc_grid_search = GridSearchCV(linear_svc_classifier, linear_svc_param_grid, cv=5, scoring='accuracy')
linear_svc_grid_search.fit(X_train_with_lex, y_train)



best_logistic_model_cv = logistic_grid_search.best_estimator_
best_linear_svc_model = linear_svc_grid_search.best_estimator_

# Evaluate Logistic Regression model
logistic_preds_cv = best_logistic_model_cv.predict(X_test_with_lex)
precision_logistic = precision_score(y_test, logistic_preds_cv, average='micro')
recall_logistic = recall_score(y_test, logistic_preds_cv, average='micro')
f1_logistic = f1_score(y_test, logistic_preds_cv, average='micro')
precision_logistic_ma = precision_score(y_test, logistic_preds_cv, average='macro')
recall_logistic_ma = recall_score(y_test, logistic_preds_cv, average='macro')
f1_logistic_ma = f1_score(y_test, logistic_preds_cv, average='macro')



# Evaluate LinearSVC model
linear_svc_preds = best_linear_svc_model.predict(X_test_with_lex)
precision_linear_svc = precision_score(y_test, linear_svc_preds, average='micro')
recall_linear_svc = recall_score(y_test, linear_svc_preds, average='micro')
f1_linear_svc = f1_score(y_test, linear_svc_preds, average='micro')
precision_linear_svc_ma = precision_score(y_test, linear_svc_preds, average='macro')
recall_linear_svc_ma = recall_score(y_test, linear_svc_preds, average='macro')
f1_linear_svc_ma = f1_score(y_test, linear_svc_preds, average='macro')

# Print evaluation metrics for Logistic Regression
print("Logistic Regression:")
#validation_score_logistic = best_logistic_model 
#print("Validation F1: {:.4f}".format(best_logistic_model))
print("Precision Micro:", precision_logistic)
print("Recall Micro:", recall_logistic)
print("F1 Score Micro:", f1_logistic)
print("Precision Macro:", precision_logistic_ma)
print("Recall Macro:", recall_logistic_ma)
print("F1 Score Macro:", f1_logistic_ma)


# Print evaluation metrics for LinearSVC
print("\nLinearSVC:")
#validation_score_SVC=best_linear_svc_model
#print("Validation F1: {:.4f}".format(best_linear_svc_model))
print("Precision Micro:", precision_linear_svc)
print("Recall Micro:", recall_linear_svc)
print("F1 Score Micro:", f1_linear_svc)
print("Precision Macro:", precision_linear_svc_ma)
print("Recall Macro:", recall_linear_svc_ma)
print("F1 Score Macro:", f1_linear_svc_ma)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
Precision Micro: 0.6716417910447762
Recall Micro: 0.6716417910447762
F1 Score Micro: 0.6716417910447762
Precision Macro: 0.45103578154425605
Recall Macro: 0.46903247405423504
F1 Score Macro: 0.4586296056884292

LinearSVC:
Precision Micro: 0.6666666666666666
Recall Micro: 0.6666666666666666
F1 Score Micro: 0.6666666666666666
Precision Macro: 0.4477240896358543
Recall Macro: 0.46520105642971393
F1 Score Macro: 0.4548200895506285




In [141]:
# Create a DataFrame to store the results
results_df = pd.DataFrame({
    'Comment Text': X_test,  # Assuming X_test contains the comment text
    'Gold Label': y_test,  # Assuming y_test contains the gold labels
    'Predictions': logistic_preds_cv
})

# Export to Excel
results_df.to_excel('logistic_regression_results1.xlsx', index=False)


In [143]:
#LOGISTIC & LINEARSVC ON TEST WITH LEXICON

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer


import random
random.seed(1)


# 1. Convert X_txt_train and X_txt_test to matricies of numbers (i.e., use CountVectorizer)
X_train_with_lex=vec.fit_transform(X_train)
X_test_with_lex=vec.transform(X_test)

# 2. Now we need to convert X_train_lexicon_features and X_test_lexicon_features to numpy arrays
X_train_lexicon_features_array=np.array(X_train_lexicon_features)
X_test_lexicon_features_array=np.array(X_test_lexicon_features)

#3. sp.hstack with arrays
X_train_with_lex=sp.hstack([X_train_with_lex,X_train_lexicon_features_array])
X_test_with_lex=sp.hstack([X_test_with_lex,X_test_lexicon_features_array])

# Initialize the classifiers
logistic_classifier_lex_cv = LogisticRegression()
linear_svc_classifier = LinearSVC()

# Train and evaluate Logistic Regression classifier
logistic_classifier.fit(X_train_with_lex, y_train)
logistic_preds = logistic_classifier.predict(X_test_with_lex)

precision_logistic = precision_score(y_test, logistic_preds, average='micro')
recall_logistic = recall_score(y_test, logistic_preds, average='micro')
f1_logistic = f1_score(y_test, logistic_preds, average='micro')
precision_logistic_ma = precision_score(y_test, logistic_preds, average='macro')
recall_logistic_ma = recall_score(y_test, logistic_preds, average='macro')
f1_logistic_ma = f1_score(y_test, logistic_preds, average='macro')


# Train and evaluate LinearSVC classifier
linear_svc_classifier.fit(X_train_with_lex, y_train)
linear_svc_preds = linear_svc_classifier.predict(X_test_with_lex)

precision_linear_svc = precision_score(y_test, linear_svc_preds, average='micro')
recall_linear_svc = recall_score(y_test, linear_svc_preds, average='micro')
f1_linear_svc = f1_score(y_test, linear_svc_preds, average='micro')
precision_linear_svc_ma = precision_score(y_test, linear_svc_preds, average='macro')
recall_linear_svc_ma = recall_score(y_test, linear_svc_preds, average='macro')
f1_linear_svc_ma = f1_score(y_test, linear_svc_preds, average='macro')

# Print evaluation metrics for Logistic Regression
print("Logistic Regression:")
print("Precision Micro:", precision_logistic)
print("Recall Micro:", recall_logistic)
print("F1 Score Macro:", f1_logistic)
print("Precision Macro:", precision_logistic_ma)
print("Recall Macro:", recall_logistic_ma)
print("F1 Score Macro:", f1_logistic_ma)


# Print evaluation metrics for LinearSVC
print("\nLinearSVC:")
print("Precision Micro:", precision_linear_svc)
print("Recall Micro:", recall_linear_svc)
print("F1 Score Micro:", f1_linear_svc)
print("Precision Macro:", precision_linear_svc_ma)
print("Recall Macro:", recall_linear_svc_ma)
print("F1 Score Macro:", f1_linear_svc_ma)

Logistic Regression:
Precision Micro: 0.6965174129353234
Recall Micro: 0.6965174129353234
F1 Score Macro: 0.6965174129353234
Precision Macro: 0.5781076406785096
Recall Macro: 0.5164466898648351
F1 Score Macro: 0.5239626725475782

LinearSVC:
Precision Micro: 0.6716417910447762
Recall Micro: 0.6716417910447762
F1 Score Micro: 0.6716417910447762
Precision Macro: 0.5611111111111111
Recall Macro: 0.4984799450818868
F1 Score Macro: 0.5064431030628214


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
