In [112]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [113]:
# Load the two raw inputs; no cleaning happens in this cell.
#   df  - borough-level ethnic-group population workbook. Rows without a 'Code'
#         are dropped at the start of the cleaning cell, so that step is not
#         repeated here.
#   df2 - prepared search records (presumably police stop-and-search data —
#         TODO confirm), limited to the columns used for modelling.
columns_to_keep = ['Asian', 'Black', 'White', 'outcomeUnsuitableForSearch', 'Female', 'Male', 'Year', 'Borough name']
df = pd.read_excel("ethnic-group-by-borough.xlsx")
df2 = pd.read_csv("prepared_search.csv", usecols=columns_to_keep)

In [114]:
# Cleaning and Renaming
# Reduce the population workbook to one row per London borough and year, with
# each ethnic group's head-count expressed as a share of the borough total.

# Rows with no 'Code' value are discarded
# (assumed to be header / footnote rows of the sheet — TODO confirm).
df = df.dropna(subset=['Code'])

london_boroughs = ['Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley', 'Camden', 'Croydon', 'Ealing','Enfield', 'Greenwich', 'Hackney', 'Hammersmith and Fulham', 'Haringey', 'Harrow', 'Havering','Hillingdon', 'Hounslow', 'Islington', 'Kensington and Chelsea', 'Kingston upon Thames', 'Lambeth', 'Lewisham', 'Merton', 'Newham', 'Redbridge', 'Richmond upon Thames', 'Southwark', 'Sutton', 'Tower Hamlets','Waltham Forest', 'Wandsworth', 'Westminster']
london_boroughs = [borough.lower() for borough in london_boroughs]

# Case-insensitive match so only the boroughs listed above are kept.
df = df[df['Area'].str.lower().isin(london_boroughs)]

# The sheet's own headers are mostly 'Unnamed: N'; the mapping below assumes the
# workbook's current column layout — TODO confirm if the source file changes.
df = df.rename(columns={'Number': 'White_population', 'Unnamed: 3': 'Asian_population', 'Unnamed: 4': 'Black_population', 'Unnamed: 5': 'Other_population', 'Unnamed: 6' : 'Total_population', 'Area': 'Borough', 'Unnamed: 13': 'Year'})
columns_keep = ['Code', 'Borough', 'White_population', 'Asian_population', 'Black_population', 'Other_population', 'Total_population', 'Year']

# Explicit .copy(): the column assignment below must write to an independent
# frame, not to a slice of the loaded one (avoids SettingWithCopyWarning).
df = df[columns_keep].drop(columns=['Code']).dropna(how='all').copy()

columns_to_convert = ['White_population', 'Black_population', 'Other_population', 'Asian_population']

# Coerce the denominator to a numeric dtype so the resulting shares are real
# floats rather than object-dtype values inherited from the Excel import.
total_population = pd.to_numeric(df['Total_population'])

# Head-counts -> integer -> fraction of the borough's total population
# (row-wise division, aligned on the index).
df[columns_to_convert] = df[columns_to_convert].astype(int).div(total_population, axis=0)

# The total and the 'Other' share are not used as model features.
df = df.drop(columns=['Total_population', 'Other_population'])

In [115]:
# Give the search data the same join-key column name as the population table.
df2 = df2.rename(columns={'Borough name': 'Borough'})

In [116]:
# Normalise both join keys on each side: lower-case borough names, integer years.
df = df.assign(
    Borough=df['Borough'].str.lower(),
    Year=df['Year'].astype(int),
)
df2 = df2.assign(
    Borough=df2['Borough'].str.lower(),
    Year=df2['Year'].astype(int),
)

# Attach the borough/year population shares to every search record.
# Inner join: records with no matching borough/year row are dropped.
merged_df = df2.merge(df, on=['Borough', 'Year'], how='inner')

In [117]:
# The join keys have served their purpose and are not model features.
merged_df = merged_df.drop(['Year', 'Borough'], axis='columns')

In [118]:
# Split the merged table into the feature matrix X and the target y.
# Work on a copy so merged_df itself keeps the target column.
X = merged_df.copy()
y = X.pop('outcomeUnsuitableForSearch')

In [119]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [120]:
# Fit a logistic regression on the training split (iteration cap raised to 1000).
# NOTE(review): the recorded confusion matrix shows every test row predicted as
# class 1, so accuracy equals the majority-class rate — consider whether class
# weighting or richer features are needed before relying on the predictions.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [121]:
# Predict on the held-out split and print the standard classification metrics.
y_pred = model.predict(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value the default prints) without emitting an UndefinedMetricWarning for each
# affected metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.717205362592975
Confusion Matrix:
 [[    0 37526]
 [    0 95171]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     37526
           1       0.72      1.00      0.84     95171

    accuracy                           0.72    132697
   macro avg       0.36      0.50      0.42    132697
weighted avg       0.51      0.72      0.60    132697



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [122]:
# Tabulate the fitted weights next to the feature they belong to.
feature_names = X.columns
feature_weights = model.coef_[0]  # first (and, for a two-class target, only) row of coef_
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': feature_weights})
print(coefficients)

# The bias term is stored separately from the per-feature weights.
intercept = model.intercept_[0]
print("Intercept:", intercept)

            Feature  Coefficient
0            Female     0.335909
1              Male     0.241399
2             Asian    -0.006467
3             Black    -0.028615
4             White    -0.062471
5  White_population     0.131521
6  Asian_population     0.162476
7  Black_population     0.155578
Intercept: 0.5814464392941927
