In [74]:
from pathlib import Path

import pandas as pd

# Folder holding the raw Excel files. Kept in a single constant so the three
# reads below stay in sync and the project can be relocated in one place.
DATA_DIR = Path("/Users/souhil/Desktop/Courses/Recherche Empirique/Final/Data")

# Crimes leading to expulsion, by residency status (per the file name)
crime_raw = pd.read_excel(DATA_DIR / "Crimes_Leading_to_Expulsion_by_Residency_Status.xlsx")

# Population counts: one named sheet, read without a header row and with the
# first two spreadsheet rows skipped, so columns are addressed by position.
pop_df = pd.read_excel(
    DATA_DIR / "Population_Count.xlsx",
    sheet_name="px-x-0102010000_101",
    header=None,
    skiprows=2,
)

# Crimes not subject to expulsion, by residency status (per the file name)
unrelevant_crime_raw = pd.read_excel(DATA_DIR / "Crimes_Not_Subject_to_Expulsion_by_Residency_Status.xlsx")

In [75]:
# Build a tidy population table with one row per (year, group)

# The year is only written on the first row of each Swiss/Foreigner pair,
# so propagate it downwards before filtering.
pop_df[0] = pop_df[0].ffill()

pop_clean = (
    pop_df
    # Keep the nationality rows (column 5) and the year / group / population columns
    .loc[lambda df: df[5].isin(["Suisse", "Étranger"]), [0, 5, 6]]
    .rename(columns={0: "year", 5: "group", 6: "population"})
    # Translate the French nationality labels to the English group names
    .assign(group=lambda df: df["group"].map({"Suisse": "Swiss", "Étranger": "Foreigner"}))
    # Drop incomplete rows before casting, so the integer cast cannot hit a NaN
    .dropna()
    .assign(
        year=lambda df: df["year"].astype(int),
        population=lambda df: pd.to_numeric(df["population"]),
    )
)

pop_clean.head()

Unnamed: 0,year,group,population
0,2010,Swiss,6103857.0
1,2010,Foreigner,1837112.0
2,2011,Swiss,6138668.0
3,2011,Foreigner,1896723.0
4,2012,Swiss,6169091.0


In [76]:
# Reshape the expulsion-relevant crime sheet into one row per (year, group)

# The frame row at position 1 holds the year headers, from column 4 onwards
years = crime_raw.iloc[1, 4:].tolist()
print(years)

# French label fragment -> aggregated residency group
group_mapping = {
    "suisses": "Swiss",
    "résidente": "Foreigner",
    "asile": "Foreigner",
    "autres": "Foreigner",
}

# Collect long-format records from every data row below the year header
records = []
for row_idx in range(2, crime_raw.shape[0]):
    label = str(crime_raw.iloc[row_idx, 3]).lower()

    # The first fragment (in mapping order) contained in the label decides the group
    group = next(
        (target for fragment, target in group_mapping.items() if fragment in label),
        None,
    )
    if group is None:
        continue  # label matches no residency group: skip the row

    # Cells that cannot be parsed as numbers become NaN and are then counted as 0
    values = crime_raw.iloc[row_idx, 4:4 + len(years)]
    counts = pd.to_numeric(values, errors="coerce").fillna(0)

    records.extend(
        {"year": int(year), "group": group, "count": count}
        for year, count in zip(years, counts)
    )

# Sum the counts of all matching rows per year and group
crimes_agg = (
    pd.DataFrame(records)
    .groupby(["year", "group"])["count"]
    .sum()
    .reset_index()
)

crimes_agg.head()

['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


Unnamed: 0,year,group,count
0,2009,Foreigner,4654.0
1,2009,Swiss,4238.0
2,2010,Foreigner,4450.0
3,2010,Swiss,4047.0
4,2011,Foreigner,4197.0


In [77]:
# Attach population figures to the aggregated expulsion-relevant crimes and
# derive a rate per 100,000 inhabitants.
final_df = (
    crimes_agg
    # Left join: every (year, group) crime row is kept, matched or not
    .merge(pop_clean, how="left", on=["year", "group"])
    # Crimes per 100,000 inhabitants, rounded to two decimals
    .assign(
        crime_rate_per_100k=lambda df: (
            (df["count"] / df["population"]) * 100_000
        ).round(2)
    )
    # Unconditional cut-off after 2023. Rows with no population match
    # (2009 in the printed output) are kept with NaN population and rate.
    .loc[lambda df: df["year"] <= 2023]
)

print(final_df)

    year      group   count  population  crime_rate_per_100k
0   2009  Foreigner  4654.0         NaN                  NaN
1   2009      Swiss  4238.0         NaN                  NaN
2   2010  Foreigner  4450.0   1837112.0               242.23
3   2010      Swiss  4047.0   6103857.0                66.30
4   2011  Foreigner  4197.0   1896723.0               221.28
5   2011      Swiss  3701.0   6138668.0                60.29
6   2012  Foreigner  4417.0   1954630.0               225.98
7   2012      Swiss  3725.0   6169091.0                60.38
8   2013  Foreigner  4111.0   2020143.0               203.50
9   2013      Swiss  3408.0   6202184.0                54.95
10  2014  Foreigner  3977.0   2085347.0               190.71
11  2014      Swiss  3743.0   6239207.0                59.99
12  2015  Foreigner  3697.0   2145153.0               172.34
13  2015      Swiss  3351.0   6278459.0                53.37
14  2016  Foreigner  4128.0   2178909.0               189.45
15  2016      Swiss  361

In [78]:
# Export the cleaned expulsion-relevant crime dataset to Excel, without the index column.
# DataFrame.to_excel does not create missing folders, so make sure the target
# "cleaned" folder exists first (no-op when it is already there).
cleaned_relevant_path = Path("/Users/souhil/Desktop/Courses/Recherche Empirique/Final/Data/cleaned/Crimes_Subject_to_Expulsion_by_Residency_Status_cleaned.xlsx")
cleaned_relevant_path.parent.mkdir(exist_ok=True)
final_df.to_excel(cleaned_relevant_path, index=False)

In [79]:
# Reshape the non-expulsion crime sheet into one row per (year, group)

# The frame row at position 1 holds the year headers, from column 4 onwards
years = unrelevant_crime_raw.iloc[1, 4:].tolist()
print(years)

# French label fragment -> aggregated residency group
group_mapping = {
    "suisses": "Swiss",
    "résidente": "Foreigner",
    "asile": "Foreigner",
    "autres": "Foreigner",
}

# Collect long-format records from every data row below the year header
records = []
for row_idx in range(2, unrelevant_crime_raw.shape[0]):
    label = str(unrelevant_crime_raw.iloc[row_idx, 3]).lower()

    # The first fragment (in mapping order) contained in the label decides the group
    group = next(
        (target for fragment, target in group_mapping.items() if fragment in label),
        None,
    )
    if group is None:
        continue  # label matches no residency group: skip the row

    # Cells that cannot be parsed as numbers become NaN and are then counted as 0
    values = unrelevant_crime_raw.iloc[row_idx, 4:4 + len(years)]
    counts = pd.to_numeric(values, errors="coerce").fillna(0)

    records.extend(
        {"year": int(year), "group": group, "count": count}
        for year, count in zip(years, counts)
    )

# Sum the counts of all matching rows per year and group
unrelevant_crimes_agg = (
    pd.DataFrame(records)
    .groupby(["year", "group"])["count"]
    .sum()
    .reset_index()
)

unrelevant_crimes_agg.head()

['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


Unnamed: 0,year,group,count
0,2009,Foreigner,115329.0
1,2009,Swiss,122208.0
2,2010,Foreigner,116362.0
3,2010,Swiss,117627.0
4,2011,Foreigner,119782.0


In [80]:
# Attach population figures to the aggregated non-expulsion crimes and
# derive a rate per 100,000 inhabitants.
unrelevant_final_df = (
    unrelevant_crimes_agg
    # Left join: every (year, group) crime row is kept, matched or not
    .merge(pop_clean, how="left", on=["year", "group"])
    # Crimes per 100,000 inhabitants, rounded to two decimals
    .assign(
        crime_rate_per_100k=lambda df: (
            (df["count"] / df["population"]) * 100_000
        ).round(2)
    )
    # Unconditional cut-off after 2023. Rows with no population match
    # (2009 in the printed output) are kept with NaN population and rate.
    .loc[lambda df: df["year"] <= 2023]
)

print(unrelevant_final_df)

    year      group     count  population  crime_rate_per_100k
0   2009  Foreigner  115329.0         NaN                  NaN
1   2009      Swiss  122208.0         NaN                  NaN
2   2010  Foreigner  116362.0   1837112.0              6333.96
3   2010      Swiss  117627.0   6103857.0              1927.09
4   2011  Foreigner  119782.0   1896723.0              6315.21
5   2011      Swiss  112087.0   6138668.0              1825.92
6   2012  Foreigner  137298.0   1954630.0              7024.24
7   2012      Swiss  114093.0   6169091.0              1849.43
8   2013  Foreigner  137416.0   2020143.0              6802.29
9   2013      Swiss  112140.0   6202184.0              1808.07
10  2014  Foreigner  129312.0   2085347.0              6200.98
11  2014      Swiss  113356.0   6239207.0              1816.83
12  2015  Foreigner  124646.0   2145153.0              5810.59
13  2015      Swiss  111407.0   6278459.0              1774.43
14  2016  Foreigner  127991.0   2178909.0              

In [81]:
# Export the cleaned non-expulsion crime dataset to Excel, without the index column.
# DataFrame.to_excel does not create missing folders, so make sure the target
# "cleaned" folder exists first (no-op when it is already there).
cleaned_unrelevant_path = Path("/Users/souhil/Desktop/Courses/Recherche Empirique/Final/Data/cleaned/Crimes_Not_Subject_to_Expulsion_by_Residency_Status_cleaned.xlsx")
cleaned_unrelevant_path.parent.mkdir(exist_ok=True)
unrelevant_final_df.to_excel(cleaned_unrelevant_path, index=False)