In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score

### Read File

In [2]:
df1 = pd.read_excel("Orders.xlsx")
df2 = pd.read_excel("Bill.xlsx")
column_names_df1 = list(df1.columns)
df1

Unnamed: 0,JurisType,JurisTypeName,State,Classification Number,Type,Description,Link,Passage Date,Category Types,Country
0,CITY,Portland,Maine,Order 1-21-22 (PDF),Order,Order Granting a Thames Street Extension Licen...,https://content.civicplus.com/api/assets/95133...,2021-07-19 00:00:00,Infrastructure,USA
1,CITY,Portland,Maine,Order 2-21-22 (PDF),Order,Order Accepting the Public Art Committee Fisca...,https://content.civicplus.com/api/assets/220d1...,2021-07-19 00:00:00,Infrastructure,USA
2,CITY,Portland,Maine,Order 3-21-22 (PDF),Order,Order Granting Municipal Officers’ Approval of...,https://content.civicplus.com/api/assets/84fbd...,2021-07-19 00:00:00,Safety,USA
3,CITY,Portland,Maine,Order 4-21-22 (PDF),Order,Order Granting Municipal Officers’ Approval of...,https://content.civicplus.com/api/assets/590d7...,2021-07-19 00:00:00,Zoning,USA
4,CITY,Portland,Maine,Order 5-21-22 (PDF),Order,Order Granting Municipal Officers’ Approval of...,https://content.civicplus.com/api/assets/167ff...,2021-07-19 00:00:00,Safety,USA
...,...,...,...,...,...,...,...,...,...,...
531,CITY,Portland,Maine,Order 146-23-24 (PDF),Order,Granting municipal officers’ approval of Rocke...,https://content.civicplus.com/api/assets/b9b60...,3/18/2024,Community,USA
532,CITY,Portland,Maine,Order 147-23-24 (PDF),Order,Granting municipal officers’ approval of Anoth...,https://content.civicplus.com/api/assets/9317f...,3/18/2024,Community,USA
533,CITY,Portland,Maine,Order 148-23-24 (PDF),Order,Granting municipal officers’ approval of 15 Ex...,https://content.civicplus.com/api/assets/2125b...,3/18/2024,Community,USA
534,CITY,Portland,Maine,Order 149-23-24 (PDF),Order,Accepting and adopting the 2024 Jill C. Duson ...,https://content.civicplus.com/api/assets/46a3f...,3/18/2024,Housing,USA


### Data Cleaning

In [3]:
column_names_df1

['JurisType',
 'JurisTypeName',
 'State',
 'Classification Number ',
 'Type',
 'Description ',
 'Link',
 'Passage Date',
 'Category Types',
 'Country']

### Extracting address from df1 --- description

In [4]:
import re

def extract_address(text):
    # Define a regex pattern for an address
    #print(text)
    regexp = "[0-9]{1,3} .+ (?:Street|St|St.|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Drive|Dr|Court|Ct|Square|Sq|Trail|Trl|Parkway|Pkwy|Circle|Cir)"
    
    address = re.findall(regexp, text)
    #print(address)
    if (len(address) > 0):
        address = address[0]
    else:
        address = ""

    return address

df1['Address_Extract'] = df1[column_names_df1[2]].apply(extract_address)
df1.to_excel("address_extract.xlsx")
#df1[column_names_df1[2]][2]
#extract_address(df1[column_names_df1[2]][2])
#print(extract_address('Order Granting Municipal Officers’ Approval of On the Rocks Cocktail Cruises LLC, dba On the Rocks Cocktail Cruises. Application for a Class I FSE at 68 Commercial Street. '))

In [5]:
txt = 'Order Granting Municipal Officers’ Approval of On the Rocks Cocktail Cruises LLC, dba On the Rocks Cocktail Cruises. Application for a Class I FSE at 68 Commercial Street. '
regexp = "[0-9]{1,3} .+ (?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Drive|Dr|Court|Ct|Square|Sq|Trail|Trl|Parkway|Pkwy|Circle|Cir)"
address = re.findall(regexp, txt)
address

['68 Commercial Street']

In [6]:
df1[column_names_df1[2]]

0      Maine
1      Maine
2      Maine
3      Maine
4      Maine
       ...  
531    Maine
532    Maine
533    Maine
534    Maine
535    Maine
Name: State, Length: 536, dtype: object

### tokenize description

In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from nltk.corpus import stopwords

In [8]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text) #remove digits
    words = word_tokenize(text)
    return words

In [9]:
base_df = pd.DataFrame({
    'Description' : df1['Description '].replace(to_replace=r'[^\w\s]', value='', regex=True).apply(tokenize), 
    'Class' : df1['Category Types'].str.lower()
})
base_df2 = pd.DataFrame({
    'Description' : df2['Description '].replace(to_replace=r'[^\w\s]', value='', regex=True).apply(tokenize), 
})
#remove stop words
stop_words = set(stopwords.words('english'))
base_df['Description'] = base_df['Description'].apply(lambda x: [word for word in x if word not in stop_words])
base_df

Unnamed: 0,Description,Class
0,"[order, granting, thames, street, extension, l...",infrastructure
1,"[order, accepting, public, art, committee, fis...",infrastructure
2,"[order, granting, municipal, officers, approva...",safety
3,"[order, granting, municipal, officers, approva...",zoning
4,"[order, granting, municipal, officers, approva...",safety
...,...,...
531,"[granting, municipal, officers, approval, rock...",community
532,"[granting, municipal, officers, approval, anot...",community
533,"[granting, municipal, officers, approval, exch...",community
534,"[accepting, adopting, jill, c, duson, housing,...",housing


In [10]:
base_df2['Description'] = base_df2['Description'].apply(lambda x: [word for word in x if word not in stop_words])
base_df2

Unnamed: 0,Description
0,"[act, expand, health, insurance, options, chil..."
1,"[act, amend, mining, excise, tax, laws]"
2,"[act, protect, victims, domestic, abuse, viole..."
3,"[act, require, health, insurance, coverage, bi..."
4,"[act, regarding, recommendations, changing, pl..."
...,...
923,"[act, amend, laws, governing, invasive, aquati..."
924,"[act, implement, recommendations, probate, tru..."
925,"[act, clarify, boundary, waldo, knox, counties..."
926,"[act, update, reimbursement, travelrelated, ex..."


### Stemming

In [11]:
from nltk.stem import PorterStemmer

In [12]:
stemmer = PorterStemmer()
def stem_words(words):
    return [stemmer.stem(word) for word in words]
base_df['Description'] = base_df['Description'].apply(stem_words)
base_df

Unnamed: 0,Description,Class
0,"[order, grant, thame, street, extens, licens, ...",infrastructure
1,"[order, accept, public, art, committe, fiscal,...",infrastructure
2,"[order, grant, municip, offic, approv, rock, c...",safety
3,"[order, grant, municip, offic, approv, hi, fid...",zoning
4,"[order, grant, municip, offic, approv, tokyo, ...",safety
...,...,...
531,"[grant, municip, offic, approv, rocket, skate,...",community
532,"[grant, municip, offic, approv, anoth, round, ...",community
533,"[grant, municip, offic, approv, exchang, llc, ...",community
534,"[accept, adopt, jill, c, duson, hous, trust, f...",housing


In [13]:
base_df2['Description'] = base_df2['Description'].apply(stem_words)
base_df2

Unnamed: 0,Description
0,"[act, expand, health, insur, option, child, ca..."
1,"[act, amend, mine, excis, tax, law]"
2,"[act, protect, victim, domest, abus, violenc, ..."
3,"[act, requir, health, insur, coverag, biomark,..."
4,"[act, regard, recommend, chang, place, name, s..."
...,...
923,"[act, amend, law, govern, invas, aquat, plant]"
924,"[act, implement, recommend, probat, trust, law..."
925,"[act, clarifi, boundari, waldo, knox, counti, ..."
926,"[act, updat, reimburs, travelrel, expens, incu..."


### Lemmatization

In [14]:
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmas

base_df['Description'] = base_df['Description'].apply(lemmatize_tokens)
base_df

Unnamed: 0,Description,Class
0,"[order, grant, thame, street, extens, licens, ...",infrastructure
1,"[order, accept, public, art, committe, fiscal,...",infrastructure
2,"[order, grant, municip, offic, approv, rock, c...",safety
3,"[order, grant, municip, offic, approv, hi, fid...",zoning
4,"[order, grant, municip, offic, approv, tokyo, ...",safety
...,...,...
531,"[grant, municip, offic, approv, rocket, skate,...",community
532,"[grant, municip, offic, approv, anoth, round, ...",community
533,"[grant, municip, offic, approv, exchang, llc, ...",community
534,"[accept, adopt, jill, c, duson, hous, trust, f...",housing


In [16]:
base_df2['Description'] = base_df2['Description'].apply(lemmatize_tokens)
base_df2

Unnamed: 0,Description
0,"[act, expand, health, insur, option, child, ca..."
1,"[act, amend, mine, excis, tax, law]"
2,"[act, protect, victim, domest, abus, violenc, ..."
3,"[act, requir, health, insur, coverag, biomark,..."
4,"[act, regard, recommend, chang, place, name, s..."
...,...
923,"[act, amend, law, govern, invas, aquat, plant]"
924,"[act, implement, recommend, probat, trust, law..."
925,"[act, clarifi, boundari, waldo, knox, counti, ..."
926,"[act, updat, reimburs, travelrel, expens, incu..."


In [17]:
base_df["Class"].value_counts()

Class
infrastructure             112
community                   83
governance                  64
safety                      62
housing                     35
zoning                      30
education                   21
environmental               20
finance                     17
public safety               16
human resources             11
legislation                 11
administration              10
arts & culture               7
legal                        6
employment                   5
fiscal                       4
public services              4
transportation               4
health & human services      3
real estate                  3
social services              3
community services           2
energy                       2
environment                  1
Name: count, dtype: int64

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [19]:
base_df = base_df.groupby('Class').filter(lambda x: len(x) >= 10)
base_df

Unnamed: 0,Description,Class
0,"[order, grant, thame, street, extens, licens, ...",infrastructure
1,"[order, accept, public, art, committe, fiscal,...",infrastructure
2,"[order, grant, municip, offic, approv, rock, c...",safety
3,"[order, grant, municip, offic, approv, hi, fid...",zoning
4,"[order, grant, municip, offic, approv, tokyo, ...",safety
...,...,...
531,"[grant, municip, offic, approv, rocket, skate,...",community
532,"[grant, municip, offic, approv, anoth, round, ...",community
533,"[grant, municip, offic, approv, exchang, llc, ...",community
534,"[accept, adopt, jill, c, duson, hous, trust, f...",housing


In [20]:
base_df["Class"].value_counts()

Class
infrastructure     112
community           83
governance          64
safety              62
housing             35
zoning              30
education           21
environmental       20
finance             17
public safety       16
human resources     11
legislation         11
administration      10
Name: count, dtype: int64

### First round implementation

In [21]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(base_df['Description'].apply(lambda x: ' '.join(x)))

# Encode the class labels
encoder = LabelEncoder()
y = encoder.fit_transform(base_df['Class'])

In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Naive Bayes

In [22]:
model = MultinomialNB().fit(X_train, y_train)

In [23]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.6842105263157895

#### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [25]:
model = LogisticRegression().fit(X_train, y_train)

In [26]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.7986270022883295

#### SVM

In [27]:
from sklearn.svm import SVC

In [28]:
model = SVC().fit(X_train, y_train)

In [29]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.7665903890160183

#### Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

In [32]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.7780320366132724

#### Neural Network(LSTM)

In [33]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [34]:
loss_lst = ["BinaryCrossentropy", "BinaryFocalCrossentropy", 
            "CTC", "CategoricalCrossentropy", "CategoricalFocalCrossentropy", 
            "CategoricalHinge", "CosineSimilarity", "Dice", "Hinge", "Huber", 
            "KLDivergence", "Loss", "MeanAbsoluteError", "MeanAbsolutePercentageError", 
            "MeanSquaredError", "MeanSquaredLogarithmicError", "Poisson", "Reduction", 
            "SparseCategoricalCrossentropy", "SquaredHinge", "Tversky"
           ]

In [35]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(base_df['Description'])
X = tokenizer.texts_to_sequences(base_df['Description'])
X = pad_sequences(X, maxlen=100)

encoder = LabelEncoder()
y = encoder.fit_transform(base_df['Class'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='CategoricalHinge', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Accuracy: {accuracy:.2f}")

Epoch 1/5




6/6 - 3s - 482ms/step - accuracy: 0.1570 - loss: 0.1080 - val_accuracy: 0.1824 - val_loss: 0.1037
Epoch 2/5
6/6 - 0s - 56ms/step - accuracy: 0.1628 - loss: 0.0730 - val_accuracy: 0.1824 - val_loss: 0.0455
Epoch 3/5
6/6 - 0s - 54ms/step - accuracy: 0.1628 - loss: 0.0432 - val_accuracy: 0.1824 - val_loss: 0.0412
Epoch 4/5
6/6 - 0s - 53ms/step - accuracy: 0.1628 - loss: 0.0411 - val_accuracy: 0.1824 - val_loss: 0.0407
Epoch 5/5
6/6 - 0s - 53ms/step - accuracy: 0.1628 - loss: 0.0408 - val_accuracy: 0.1824 - val_loss: 0.0406
Accuracy: 0.18


#### KNN

In [36]:
from sklearn.neighbors import NearestNeighbors
k = 2  # Number of neighbors
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
nn.fit(X_train)  # NearestNeighbors requires dense input if using cosine

# Finding the k-nearest neighbors for each point in the test set
distances, indices = nn.kneighbors(X_test)

# Voting logic to classify based on the nearest neighbors
y_pred = []
for index_array in indices:
    neighbor_labels = [y_train[idx] for idx in index_array]
    # Get the most common class label among the nearest neighbors
    most_common = max(set(neighbor_labels), key=neighbor_labels.count)
    y_pred.append(most_common)

# Compare the predictions with the actual labels
print("Predicted labels:", y_pred)
print("Actual labels:", y_test)
#calculate accuracy
count = 0
for i in range(len(y_pred)):
    if (y_pred[i] == y_test[i]):
        count += 1
print ("Accuracy:", count / len(y_pred))

Predicted labels: [4, 3, 5, 11, 8, 2, 11, 8, 1, 8, 2, 12, 8, 8, 8, 5, 8, 5, 8, 3, 2, 1, 1, 4, 8, 1, 8, 1, 8, 5, 11, 1, 8, 8, 8, 8, 3, 4, 8, 1, 8, 1, 1, 11, 1, 1, 8, 11, 8, 8, 8, 11, 1, 8, 8, 8, 11, 8, 6, 5, 1, 8, 11, 3, 8, 5, 8, 8, 8, 8, 1, 8, 1, 3, 5, 5, 5, 8, 11, 1, 12, 8, 8, 10, 8, 8, 2, 3, 1, 1, 8, 11, 1, 8, 8, 8, 1, 3, 8, 0, 12, 8, 1, 1, 1, 1, 8, 10, 1, 8, 9, 8, 11, 1, 9, 1, 1, 12, 8, 8, 8, 10, 11, 8, 11, 10, 4, 2, 8, 11, 8, 1, 8, 8, 8, 8, 1, 8, 4, 6, 9, 0, 1, 8, 8, 12, 8, 8]
Actual labels: [ 8 11  5 10 11  2 11  8  3  6  2  5 11 11  5 11  8  0 12  1  4  0  1  4
  8  1  5 11  1  5  8  6  3  5 11  4  8  8  8  1  2  8  8 11 11  1  8  5
 11  1  1 11  1 11  1  8  1  1 11 12  8  6  1  8  6 12 12  5  5  3  1  0
  5  6  8  8  5  1  2  9 12  8  6  5  5  6  1  8 12  6 11  2  1 12  2  6
 11  5  9  8 11  1  7  1  3  5  1  1  1  8  9  2 11  3  9 10  5  6  2  8
  2  8 11 11 10  8  4  2  8  5  8  3  1  8  8  5 11  1  4 11  9  1  1 12
  8 11  8  1]
Accuracy: 0.2905405405405405


### N gram model -- feature engineering

In [39]:
vectorizer = CountVectorizer(ngram_range=(2, 2))  
X = vectorizer.fit_transform(base_df['Description'].apply(lambda x: ' '.join(x)))
# Encode the class labels
encoder = LabelEncoder()
y = encoder.fit_transform(base_df['Class'])

smote = SMOTE()
X, y = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [42]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6247139588100686

### TF-IDF model -- feature engineering

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(base_df['Description'].apply(lambda x: ' '.join(x)))
X_final = vectorizer.fit_transform(base_df2['Description'].apply(lambda x: ' '.join(x)))
# Encode the class labels
encoder = LabelEncoder()
y = encoder.fit_transform(base_df['Class'])

smote = SMOTE()
X, y = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


#### Logistic Regression

In [30]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [31]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9107551487414187

#### Logistic Regression parameter adjustment

In [50]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, make_scorer

In [58]:
model = LogisticRegression(max_iter = 1000)

# Define the parameter grid
param_grid = {
    'solver': ["newton-cg","sag", "saga", "lbfgs"],  # Suitable for small datasets and binary classification
    'penalty': ['l1', 'l2'],  # Trying both L1 and L2 regularization
    'C': [0.01, 0.1, 1, 10]
}

# Setup the grid search with 10-fold cross-validation
cv = StratifiedKFold(n_splits=10)  # Ensuring each fold is a good representative of the whole
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit grid search
grid_search.fit(X_train, y_train)

# Extract results
results = grid_search.cv_results_
max_accuracy = max(results['mean_test_score'])
print("Accuracy:", max_accuracy)
print("Parameters:", grid_search.best_params_)



KeyboardInterrupt: 

#### Naive Bayes

In [63]:
model = MultinomialNB().fit(X_train, y_train)

In [64]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.8489702517162472

#### SVM

In [59]:
model = SVC(random_state = 42).fit(X_train, y_train)

In [60]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.9061784897025171

#### parameter adjustment

In [64]:
model = SVC(max_iter = 1000)

# Define the parameter grid
param_grid = {
    'gamma' : ['scale', 'auto'], 
    'tol' : np.arange(0.00001, 0.001, 0.00001), 
}

# Setup the grid search with 10-fold cross-validation
cv = StratifiedKFold(n_splits=10)  # Ensuring each fold is a good representative of the whole
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit grid search
grid_search.fit(X_train, y_train)

# Extract results
results = grid_search.cv_results_
max_accuracy = max(results['mean_test_score'])
print("Accuracy:", max_accuracy)
print("Parameters:", grid_search.best_params_)

Accuracy: 0.9382664147378366
Parameters: {'gamma': 'scale', 'tol': 1e-05}


#### final model

In [65]:
svm_model = SVC(max_iter = 1000, gamma = "scale", tol = 1e-05, random_state = 42).fit(X_train, y_train)

In [66]:
predicted_labels = svm_model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.9061784897025171

#### underfitting or overfitting

In [67]:
predicted_labels = svm_model.predict(X_train)
accuracy = metrics.accuracy_score(y_train, predicted_labels)
accuracy

0.9842983316977428

#### Random Forest

In [84]:
model_rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

In [85]:
predicted_labels = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted_labels)
accuracy

0.9016018306636155

#### parameter adjustment

In [78]:
n_estimators_lst = []
max_depth_lst = []
max_leaf_nodes_lst = []
accuracy_lst = []
count = 1
for n in range (100, 201, 20):
    for depth in range (2, 20):
        for leaf in range (2, 20):
            if (count % 10 == 0):
                print (count)
            count += 1
            model = RandomForestClassifier(n_estimators = n, max_depth = depth, max_leaf_nodes = leaf, random_state = 42)
            cv_scores = cross_val_score(model, X, y, cv=10)
            n_estimators_lst.append(n)
            max_depth_lst.append(depth)
            max_leaf_nodes_lst.append(leaf)
            accuracy_lst.append(np.mean(cv_scores))
    

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300
1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500
1510
1520
1530
1540
1550
1560
1570
1580
1590
1600
1610
1620
1630
1640
1650
1660
1670
1680
1690
1700
1710
1720
1730
1740
1750
1760
1770
1780
1790
1800
1810
1820
1830
1840
1850
1860
1870
1880
1890
1900
1910
1920
1930
1940


In [79]:
rf_df = pd.DataFrame({
    "n_estimators" : n_estimators_lst, 
    "max_depth" : max_depth_lst, 
    "max_leaf_nodes" : max_leaf_nodes_lst, 
    "accuracy" : accuracy_lst
})
rf_df = rf_df.sort_values(by = ["accuracy"], ascending = [False])
rf_df.to_csv("rf_df.csv")
rf_df

Unnamed: 0,n_estimators,max_depth,max_leaf_nodes,accuracy
1727,200,7,19,0.798838
1097,160,8,19,0.797473
1079,160,7,19,0.797463
1133,160,10,19,0.795418
1781,200,10,19,0.795413
...,...,...,...,...
414,120,7,2,0.614095
396,120,6,2,0.614095
378,120,5,2,0.614095
360,120,4,2,0.614095


#### overfitting / underfitting

In [80]:
model = RandomForestClassifier(n_estimators = n, max_depth = depth, max_leaf_nodes = leaf, random_state = 42).fit(X_train, y_train)
y_pred = model.predict(X_train)
accuracy = metrics.accuracy_score(y_train, y_pred)
accuracy

0.8361138370951914

In [176]:
from sklearn.metrics import confusion_matrix, recall_score
model = RandomForestClassifier(n_estimators = n, max_depth = depth, max_leaf_nodes = leaf, random_state = 42).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
#accuracy
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]
FN = cm[1, 0]

# Calculate sensitivity
sensitivity = TP / (TP + FN)
sensitivity

1.0

In [86]:
model_rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
cv_scores = cross_val_score(model, X, y, cv=10)
np.mean(cv_scores)

0.9211431270666036

### use final model to predict

In [32]:
final_pred = classifier.predict(X_final[:,2:1029])
final_pred

array([ 1,  8,  8,  4,  8,  8,  8,  1,  3,  8,  3, 12,  8,  8,  8,  8,  8,
        1,  8,  1,  1,  8, 10,  1,  1,  1, 10,  8,  8,  1,  8,  8, 11,  1,
        8, 10,  1, 11,  8, 12,  8,  8,  1,  8,  8,  8,  8, 10,  1,  1,  1,
       12,  8, 12,  8,  1,  8,  8,  1,  8,  8,  1, 12,  8,  1, 12,  1,  8,
        1,  6,  1,  1, 11,  8,  1,  1,  1,  8,  1,  8,  8,  8,  1,  8,  8,
        8,  8,  8, 12,  3, 11,  1,  8,  1,  8,  1, 12,  1,  8,  1, 12,  8,
        8,  8,  1,  1,  1,  8,  1,  1,  8,  8,  8,  8,  8,  8,  1,  1,  1,
       10,  1,  8,  1,  1,  8,  8,  1,  8,  1,  8,  6,  8,  8, 12,  1,  8,
       12,  8,  8,  1,  1, 12, 10,  8,  8,  1,  1, 11,  8,  8,  8,  8,  1,
        1,  1,  8,  1,  8,  8,  8,  8,  1,  3, 10,  8,  1,  8,  8,  8,  1,
        8,  8,  5,  1,  1,  8,  1,  8,  8,  1,  8,  1,  1,  5,  1,  1,  1,
        5,  8,  1,  1,  1,  1,  8,  1,  8,  1,  8, 11,  8,  1,  1,  1,  8,
        1,  1,  1,  8, 12,  8,  8,  8,  8, 11,  1,  6,  1,  1, 12,  8,  1,
        8,  1, 10,  8,  1

In [33]:
df2.reset_index(drop=True, inplace=True)
#final_pred.reset_index(drop=True, inplace=True)
df2['Category Types'] = list(encoder.inverse_transform(final_pred))
df2.to_csv("new_df2.csv")

In [160]:
X_final.shape[0]

492

In [158]:
df2['Category Types']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
923   NaN
924   NaN
925   NaN
926   NaN
927   NaN
Name: Category Types, Length: 928, dtype: float64

In [167]:
pd.DataFrame(X_test)

Unnamed: 0,0
0,"(0, 83)\t0.009490899976104767\n (0, 231)\t0..."
1,"(0, 347)\t0.0717112251874176\n (0, 62)\t0.0..."
2,"(0, 1001)\t0.4714018667072575\n (0, 424)\t0..."
3,"(0, 149)\t0.12681261954079942\n (0, 21)\t0...."
4,"(0, 627)\t0.12736300042116153\n (0, 62)\t0...."
...,...
432,"(0, 136)\t0.4251433515438467\n (0, 958)\t0...."
433,"(0, 186)\t0.14346710639962854\n (0, 597)\t0..."
434,"(0, 62)\t0.03098285517981942\n (0, 58)\t0.0..."
435,"(0, 82)\t0.06096506732369721\n (0, 139)\t0...."


In [34]:
df2['Category Types'].value_counts()

Category Types
infrastructure     411
community          360
zoning              54
public safety       34
environmental       23
safety              20
governance          12
housing              9
finance              2
human resources      1
education            1
administration       1
Name: count, dtype: int64

#### KNN

In [69]:
from sklearn.neighbors import NearestNeighbors
k = 2  # Number of neighbors
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
nn.fit(X_train)  # NearestNeighbors requires dense input if using cosine

# Finding the k-nearest neighbors for each point in the test set
distances, indices = nn.kneighbors(X_test)

# Voting logic to classify based on the nearest neighbors
y_pred = []
for index_array in indices:
    neighbor_labels = [y_train[idx] for idx in index_array]
    # Get the most common class label among the nearest neighbors
    most_common = max(set(neighbor_labels), key=neighbor_labels.count)
    y_pred.append(most_common)

# Compare the predictions with the actual labels
print("Predicted labels:", y_pred)
print("Actual labels:", y_test)
#calculate accuracy
count = 0
for i in range(len(y_pred)):
    if (y_pred[i] == y_test[i]):
        count += 1
print ("Accuracy:", count / len(y_pred))

Predicted labels: [0, 10, 8, 7, 7, 1, 9, 9, 9, 4, 7, 1, 7, 0, 4, 7, 5, 7, 5, 0, 7, 8, 2, 4, 9, 2, 8, 12, 10, 2, 2, 0, 5, 8, 9, 2, 2, 4, 7, 8, 1, 5, 11, 8, 2, 1, 1, 11, 5, 3, 10, 9, 10, 5, 7, 8, 4, 5, 3, 10, 11, 12, 8, 7, 11, 3, 8, 1, 9, 4, 7, 5, 1, 5, 9, 2, 4, 1, 10, 5, 11, 0, 8, 6, 5, 0, 11, 7, 3, 6, 4, 10, 3, 2, 10, 8, 11, 7, 9, 2, 6, 7, 9, 8, 1, 6, 2, 12, 3, 6, 2, 9, 7, 0, 8, 0, 10, 4, 9, 4, 0, 0, 12, 12, 6, 0, 3, 8, 4, 7, 0, 8, 9, 10, 9, 3, 6, 9, 4, 5, 1, 7, 8, 4, 9, 8, 5, 10, 8, 7, 4, 3, 0, 5, 2, 12, 9, 5, 0, 5, 3, 10, 0, 0, 11, 7, 8, 3, 10, 1, 5, 3, 3, 1, 12, 12, 4, 10, 12, 7, 11, 12, 11, 8, 6, 1, 7, 12, 7, 9, 6, 2, 10, 4, 7, 0, 7, 11, 11, 9, 7, 3, 0, 7, 12, 11, 8, 6, 5, 0, 12, 5, 8, 12, 8, 12, 1, 4, 4, 10, 6, 2, 1, 3, 10, 7, 6, 12, 7, 10, 6, 5, 0, 7, 6, 8, 1, 0, 6, 0, 3, 11, 8, 1, 9, 0, 5, 2, 1, 11, 12, 12, 6, 9, 9, 3, 9, 2, 9, 7, 3, 2, 3, 6, 8, 9, 4, 11, 7, 1, 10, 10, 8, 3, 10, 1, 0, 7, 4, 7, 12, 12, 6, 1, 3, 4, 2, 1, 2, 12, 2, 3, 8, 7, 2, 10, 0, 8, 8, 12, 8, 5, 12, 1, 7, 9, 9,

#### LSTM

In [59]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(base_df['Description'])
X = tokenizer.texts_to_sequences(base_df['Description'])
X = pad_sequences(X, maxlen=100)

encoder = LabelEncoder()
y = encoder.fit_transform(base_df['Class'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='CategoricalHinge', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Accuracy: {accuracy:.2f}")

Epoch 1/5




6/6 - 3s - 463ms/step - accuracy: 0.1483 - loss: 0.1082 - val_accuracy: 0.1824 - val_loss: 0.1036
Epoch 2/5
6/6 - 0s - 47ms/step - accuracy: 0.1628 - loss: 0.0734 - val_accuracy: 0.1824 - val_loss: 0.0466
Epoch 3/5
6/6 - 0s - 46ms/step - accuracy: 0.1628 - loss: 0.0441 - val_accuracy: 0.1824 - val_loss: 0.0416
Epoch 4/5
6/6 - 0s - 49ms/step - accuracy: 0.1628 - loss: 0.0414 - val_accuracy: 0.1824 - val_loss: 0.0409
Epoch 5/5
6/6 - 0s - 45ms/step - accuracy: 0.1628 - loss: 0.0409 - val_accuracy: 0.1824 - val_loss: 0.0407
Accuracy: 0.18


### Word2Vec model -- feature engineering

In [46]:
from gensim.models import Word2Vec

In [47]:
train, test = train_test_split(base_df, test_size=0.2, random_state=42)
model_w2v = Word2Vec(sentences=train['Description'], vector_size=100, window=5, min_count=1, workers=4)
import numpy as np

def document_vector(doc):
    doc = [word for word in doc if word in model_w2v.wv.key_to_index]
    return np.mean(model_w2v.wv[doc], axis=0) if doc else np.zeros(model_w2v.vector_size)

train_vectors = np.array([document_vector(doc) for doc in train['Description']])
test_vectors = np.array([document_vector(doc) for doc in test['Description']])

In [48]:
train_vectors_smote, train_labels_smote = smote.fit_resample(train_vectors, train['Class'])

In [49]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(train_vectors_smote, train_labels_smote)

predictions = logreg.predict(test_vectors)

accuracy = accuracy_score(test['Class'], predictions)
accuracy

0.24242424242424243