In [8]:
import pandas as pd

# Assemble the labelled dataset for a text classifier that predicts whether a
# vehicle name refers to an electric (1) or a fossil-fuel (0) vehicle.
TEXT_COL = 'electric_or_fossil'
LABEL_COL = 'is_electric'


def load_labeled_names(csv_path, label):
    """Read a header-less CSV and tag every row with a class label.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that is read with ``header=None``; its first
        column (position 0) is renamed to ``TEXT_COL``.
    label : int
        Value written to the ``LABEL_COL`` column for every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the text column followed by the label column.
    """
    return (
        pd.read_csv(csv_path, header=None)
        .rename(columns={0: TEXT_COL})
        .assign(**{LABEL_COL: label})
    )


# Positive class: names listed in electric.csv.
# NOTE(review): the printed frame shows the literal value 'electric' as row 0,
# which may be a header line of electric.csv being read as data because of
# header=None. Confirm against the file before trusting that sample.
electric = load_labeled_names('electric.csv', 1)

# Negative class: names listed in fossil_fuel.csv.
fossil_fuel = load_labeled_names('fossil_fuel.csv', 0)

# Stack both classes into one frame; ignore_index=True gives a fresh 0..n-1
# index, replacing the separate reset_index(drop=True) step.
fossil_electric = pd.concat([electric, fossil_fuel], ignore_index=True)

print(fossil_electric)

              electric_or_fossil  is_electric
0                       electric            1
1          Hyundai Kona Electric            1
2                        Kia EV6            1
3                       MG ZS EV            1
4                         BMW i4            1
..                           ...          ...
205     Mercedes-Benz AMG A 45 S            0
206  BMW 3 Series Gran Limousine            0
207               MG Hector Plus            0
208                   Audi RS Q8            0
209         Maruti Alto 800 tour            0

[210 rows x 2 columns]


In [9]:
# Keep only the rows that have a non-null value in the electric_or_fossil text column
fossil_electric = fossil_electric.dropna(subset=['electric_or_fossil'])

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest classifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, and the recorded report shows only
# 4 positive rows in the test fold. Consider passing stratify= if per-class
# metrics matter.
texts = fossil_electric['electric_or_fossil']
labels = fossil_electric['is_electric']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Turn each name into TF-IDF weights over unigrams and bigrams (vocabulary capped
# at 10,000 terms). The vocabulary is learned from the training fold only and
# then reused unchanged for the test fold.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Fit a 100-tree random forest on the vectorized training names
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train_vect, y_train)

# Score the held-out fold
y_pred = model.predict(X_test_vect)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Persist both fitted objects. The vectorizer is saved as well as the model,
# presumably so new text can be transformed with the same vocabulary at
# prediction time.
# NOTE(review): these are absolute, user-specific paths; the cell will fail on
# any machine without this directory.
model_path = '/Users/Lauren/Desktop/fossil_electric_rf_model.joblib'
vectorizer_path = '/Users/Lauren/Desktop/fossil_electric_vectorizer.joblib'
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)


Accuracy: 0.9285714285714286

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        38
           1       0.67      0.50      0.57         4

    accuracy                           0.93        42
   macro avg       0.81      0.74      0.77        42
weighted avg       0.92      0.93      0.92        42



['/Users/Lauren/Desktop/fossil_electric_vectorizer.joblib']

In [13]:
['/Users/Lauren/Desktop/fossil_electric_vectorizer.joblib']

['/Users/Lauren/Desktop/fossil_electric_vectorizer.joblib']