In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Borough-level workbook (named "ethnic-group-by-borough"); rows with no value
# in the 'Code' column are discarded straight away.
df = pd.read_excel("ethnic-group-by-borough.xlsx").dropna(subset=['Code'])

# Only this subset of columns is read from the prepared search CSV.
columns_to_keep = [
    'Asian', 'Black', 'White', 'outcomeUnsuitableForSearch',
    'Female', 'Male', 'Year', 'Borough name', 'Other.1', 'Other',
]
df2 = pd.read_csv("prepared_search.csv", usecols=columns_to_keep)

In [3]:
# Cleaning and Renaming
# Reduce the borough workbook to the London boroughs listed below and express
# each ethnic-group column as a fraction of the borough's total population.
# (Presumably the raw columns are head counts -- TODO confirm against the sheet.)
london_boroughs = ['Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley', 'Camden', 'Croydon', 'Ealing','Enfield', 'Greenwich', 'Hackney', 'Hammersmith and Fulham', 'Haringey', 'Harrow', 'Havering','Hillingdon', 'Hounslow', 'Islington', 'Kensington and Chelsea', 'Kingston upon Thames', 'Lambeth', 'Lewisham', 'Merton', 'Newham', 'Redbridge', 'Richmond upon Thames', 'Southwark', 'Sutton', 'Tower Hamlets','Waltham Forest', 'Wandsworth', 'Westminster']
london_boroughs = [borough.lower() for borough in london_boroughs]

columns_keep = ['Borough', 'White_population', 'Asian_population', 'Black_population', 'Other_population', 'Total_population', 'Year']
columns_to_convert = ['White_population', 'Black_population', 'Other_population', 'Asian_population']

# Rows without a 'Code' were already dropped when the workbook was loaded, so
# that step is not repeated here.
is_london_borough = df['Area'].str.lower().isin(london_boroughs)
df = (
    df.loc[is_london_borough]
    .rename(columns={'Number': 'White_population', 'Unnamed: 3': 'Asian_population', 'Unnamed: 4': 'Black_population', 'Unnamed: 5': 'Other_population', 'Unnamed: 6' : 'Total_population', 'Area': 'Borough', 'Unnamed: 13': 'Year'})
    .loc[:, columns_keep]
    # Explicit copy: the assignments below must modify an independent frame,
    # not a slice of the loaded workbook (avoids SettingWithCopyWarning).
    .copy()
)
# No dropna(how='all') is needed: 'Borough' is non-null on every row that
# passed the isin() filter above, so no row can be entirely empty.

df[columns_to_convert] = df[columns_to_convert].astype(int)
# Coerce the denominator explicitly so the shares come out as floats whatever
# dtype the Excel reader assigned to the total column.
total_population = pd.to_numeric(df['Total_population'])
df[columns_to_convert] = df[columns_to_convert].div(total_population, axis=0)

# The total and the 'Other' share are not carried forward into the merge.
df = df.drop(columns=['Total_population', 'Other_population'])

In [4]:
# Give the search columns unambiguous names before merging.
# NOTE(review): 'Other' / 'Other.1' look like two same-named CSV columns that
# pandas de-duplicated by position -- confirm which one is gender and which is
# ethnicity against the column order in prepared_search.csv.
search_column_names = {
    'Borough name': 'Borough',
    'Other': 'Other_Gender',
    'Other.1': 'Other_Ethnicity',
}
df2 = df2.rename(columns=search_column_names)

In [5]:
# Normalise the join keys on both frames so they compare equal:
# lower-cased borough names and integer years.
df = df.assign(Borough=df['Borough'].str.lower(), Year=df['Year'].astype(int))
df2 = df2.assign(Borough=df2['Borough'].str.lower(), Year=df2['Year'].astype(int))

# Inner join: a row survives only if its (Borough, Year) pair exists in both frames.
merged_df = df2.merge(df, how='inner', on=['Borough', 'Year'])

In [6]:
# The join keys are removed so they are not part of the saved/modelled table.
join_keys = ['Year', 'Borough']
merged_df = merged_df.drop(join_keys, axis='columns')

In [None]:
# Persist the merged table; the row index is not written to the file.
merged_df.to_csv(path_or_buf='Final.csv', index=False)

In [None]:
# Separate the label column from the remaining columns, which act as features.
target_column = 'outcomeUnsuitableForSearch'
y = merged_df[target_column]
X = merged_df.drop(target_column, axis='columns')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Logistic regression with the solver's iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000)
# Fit on the training portion only; the held-out rows stay unseen until evaluation.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation metrics: each entry pairs the printed label with the scorer that
# is called as scorer(y_true, y_pred).
evaluation_reports = (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for report_label, scorer in evaluation_reports:
    print(report_label, scorer(y_test, y_pred))