In [None]:
# This notebook merges all provincial datasets and runs some initial analyses.
# Datasets merged:
# '../supplementary_data/combined_violence_by_province.pkl'
# '../supplementary_data/protest_data_all.pkl'
# '../supplementary_data/provincial_data_2019.pkl'

In [None]:
import pandas as pd
import numpy as np

# Merge GDP data

In [None]:
# Load the provincial per-capita GDP sheet; a 'Province' column is required by the cleaning step below.
df_gdp_pc = pd.read_excel('../gdp_percapita_province.xlsx')

In [None]:
# Map full official province names to the short names used as the merge key
province_mapping = {
    '上海市': '上海', 
    '北京市': '北京',
    '天津市': '天津',
    '重庆市': '重庆',
    '西藏自治区': '西藏', 
    # NOTE(review): the next key lacks the trailing '区' (the full name is mapped further down) —
    # presumably it matches a truncated label in one of the inputs; confirm.
    '新疆维吾尔自治': '新疆',
    '内蒙古自治区': '内蒙古', 
    '黑龙江省': '黑龙江', 
    '吉林省': '吉林', 
    '辽宁省': '辽宁', 
    '河北省': '河北', 
    '山东省': '山东', 
    '河南省': '河南', 
    '江苏省': '江苏', 
    '安徽省': '安徽', 
    '浙江省': '浙江', 
    '福建省': '福建', 
    '广东省': '广东', 
    '广西壮族自治区': '广西', 
    '湖南省': '湖南', 
    '湖北省': '湖北', 
    '江西省': '江西', 
    '四川省': '四川', 
    '贵州省': '贵州', 
    '云南省': '云南',
    '陕西省': '陕西', 
    '甘肃省': '甘肃', 
    '山西省': '山西', 
    '海南省': '海南',
    '宁夏回族自治区': '宁夏',
    '新疆维吾尔自治区': '新疆',
    '青海省': '青海'
    } # generated by ChatGPT
# Strip surrounding whitespace (spaces, tabs, newlines), then apply the mapping.
# Labels that are not keys of the mapping are left unchanged by `replace`.
df_gdp_pc['Province'] = df_gdp_pc['Province'].str.strip().replace(province_mapping)

In [None]:
# Positional rename: this assumes the sheet has exactly these six columns in this order — TODO confirm.
df_gdp_pc.columns = ['province', 'gdp_pc_2021', 'gdp_pc_2020', 'gdp_pc_2015', 'gdp_pc_2010', 'gdp_pc_2000']
df_gdp_pc

In [None]:
# Hard-coded total-GDP table: one list per column, all lists aligned by position with '省份'.
# The first entry ('中国大陆') is the mainland-China row; the remaining entries are provinces.
# NOTE(review): the unit of the GDP figures is not stated in this notebook — confirm against the source.
data = {
    '省份': [
        '中国大陆', '广东省', '江苏省', '山东省', '浙江省', '河南省', '四川省', '湖北省', 
        '福建省', '湖南省', '安徽省', '上海市', '河北省', '北京市', '陕西省', '江西省', 
        '重庆市', '辽宁省', '云南省', '广西壮族自治区', '山西省', '内蒙古自治区', '贵州省', 
        '新疆维吾尔自治区', '天津市', '黑龙江省', '吉林省', '甘肃省', '海南省', '宁夏回族自治区', 
        '青海省', '西藏自治区'
    ],
    'gdp_2022': [
        121020.72, 12911.86, 12287.56, 8743.51, 7771.54, 6134.51, 5674.98, 5373.49, 
        5310.99, 4867.04, 4504.50, 4465.28, 4237.04, 4161.10, 3277.27, 3207.47, 
        2912.90, 2897.51, 2895.42, 2630.09, 2564.26, 2315.87, 2016.46, 1774.13, 
        1631.13, 1590.10, 1307.02, 1120.16, 681.82, 506.96, 361.01, 213.26
    ],
    'gdp_2021': [
        114923.70, 12471.95, 11739.24, 8287.52, 7404.08, 5807.14, 5408.80, 5009.12, 
        4956.61, 4571.35, 4256.52, 4365.32, 4039.71, 4104.56, 3012.17, 2982.78, 
        2807.73, 2756.95, 2716.16, 2520.91, 2287.04, 2116.60, 1945.86, 1631.16, 
        1568.51, 1485.82, 1316.38, 1022.55, 650.41, 458.82, 338.51, 208.02
    ],
    'gdp_2020': [
        101356.70, 11115.16, 10280.77, 7279.82, 6468.91, 5425.94, 4850.16, 4300.45, 
        4360.86, 4154.26, 3806.15, 3896.33, 3601.38, 3594.33, 2601.41, 2578.20, 
        2504.14, 2501.14, 2455.57, 2212.09, 1783.56, 1725.80, 1786.04, 1380.07, 
        1400.80, 1363.34, 1225.60, 897.97, 556.62, 395.63, 300.98, 190.27
    ],
    'gdp_2015': [
        68885.82, 7473.24, 7125.59, 5528.88, 4350.77, 3708.41, 3034.20, 3034.40, 
        2681.95, 2853.86, 2383.12, 2688.70, 2639.84, 2477.91, 1789.88, 1678.09, 
        1604.05, 2021.03, 1496.00, 1479.78, 1183.64, 1294.90, 1054.10, 930.69, 
        1087.95, 1169.00, 1001.80, 655.66, 373.42, 257.94, 201.10, 104.30
    ],
    'gdp_2010': [
        41211.93, 4594.46, 4138.39, 3392.25, 2739.99, 2265.50, 1722.48, 1622.69, 
        1500.25, 1557.43, 1324.98, 1791.54, 1800.36, 1496.40, 984.52, 938.32, 
        806.53, 1389.63, 773.53, 855.24, 890.39, 819.99, 451.90, 536.02, 
        683.08, 830.83, 641.05, 394.37, 202.05, 157.17, 114.42, 51.29
    ],
    'gdp_2000': [
        10028.01, 1081.02, 855.37, 827.81, 616.48, 505.30, 392.82, 354.54, 
        376.45, 355.15, 312.53, 481.22, 462.82, 327.78, 180.40, 200.31, 
        182.21, 466.91, 203.01, 208.00, 184.57, 153.91, 102.99, 136.36, 
        159.17, 285.55, 175.14, 105.29, 52.68, 29.50, 26.37, 11.78
    ]
}

# Create the DataFrame (one row per entry of '省份', one column per dict key)
df_gdp = pd.DataFrame(data)


# Plain-text dump of the table for a quick visual check
print(df_gdp)


In [None]:
# Rename the key column first so that this cell can be re-run safely: `rename` ignores a
# label that is no longer present, whereas indexing df_gdp['省份'] after the column has
# already been renamed raises KeyError.
df_gdp = df_gdp.rename(columns={'省份': 'province'})
# Normalise the names to the short merge keys. Labels that are not keys of the mapping
# pass through `replace` unchanged.
df_gdp['province'] = df_gdp['province'].str.strip().replace(province_mapping)
df_gdp

In [None]:
# Show the rows ranked by 2022 GDP (display only — the sorted frame is not assigned).
df_gdp.sort_values(by = 'gdp_2022', ascending = False)

In [None]:
# Load the cleaned main dataset (presumably one row per recorded case — confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../all_tweets_cleaned_final.pkl')

In [None]:
df.info()

In [None]:
# Distinct province labels, ordered from most to least frequent.
df['province'].value_counts().index

In [None]:
# Row count per province as a two-column frame: 'province', 'total_violence_cases'
total_cases_by_province = (
    df['province']
    .value_counts()
    .rename_axis('province')
    .reset_index(name='total_violence_cases')
)

In [None]:
# Inner-join the case counts with both GDP tables on 'province';
# a province has to appear in all three frames to survive.
df_merged = (
    total_cases_by_province
    .merge(df_gdp_pc, on='province')
    .merge(df_gdp, on='province')
)
df_merged

In [None]:
from scipy.stats import pearsonr
# Pearson correlation (coefficient, two-sided p-value) between case counts and 2021 per-capita GDP
corr, pvalue = pearsonr(df_merged['total_violence_cases'], df_merged['gdp_pc_2021'])
print(corr, pvalue)

In [None]:
# Same test against total 2022 GDP
corr, pvalue = pearsonr(df_merged['total_violence_cases'], df_merged['gdp_2022'])
print(corr, pvalue)

In [None]:
# Keep only the province key and the three coded categorical variables
df_cate = df[['province', 'dispute_area', 'worst_violence_types', 'types_of_perpetrator']]
df_cate

In [None]:
# Expand each of the three variables into one indicator column per observed value;
# 'province' is not listed in `columns` and is therefore left as is.
df_with_dummies = pd.get_dummies(df_cate, columns = ['dispute_area', 'worst_violence_types', 'types_of_perpetrator'])

In [None]:
df_with_dummies.info()

In [None]:
# Mean coded violence level per province, as a Series named 'worst_violence_types'.
# The column is selected *before* aggregating: the first positional parameter of
# GroupBy.mean is `numeric_only`, so a column name passed to `.mean(...)` would be read
# as that flag instead of choosing the column to average.
average_degree_violence = df_cate.groupby('province')['worst_violence_types'].mean()

In [None]:
# Per-province totals of every indicator column, i.e. the number of rows per category value
sum_by_province = df_with_dummies.groupby('province').sum()

In [None]:
# Both right-hand objects are indexed by 'province'; `on=` may refer to an index level name.
# NOTE: df_merged is rebound here, so re-running this cell merges the same columns a second time.
df_merged = pd.merge(df_merged, sum_by_province, on = 'province')
df_merged = pd.merge(df_merged, average_degree_violence, on = 'province')
df_merged.info()

In [None]:
df_merged.columns

In [None]:
# Variables entering the correlation matrix: totals, GDP, the dispute-area and
# perpetrator-type counts, and the per-province mean of 'worst_violence_types'.
columns_keep = ['total_violence_cases', 'gdp_2022', 'gdp_pc_2021', 'dispute_area_1.0',
       'dispute_area_2.0', 'dispute_area_3.0', 'dispute_area_4.0',
       'dispute_area_5.0', 'dispute_area_6.0', 
       'types_of_perpetrator_1.0', 'types_of_perpetrator_2.0',
       'types_of_perpetrator_3.0', 'worst_violence_types']
df_keep = df_merged[columns_keep]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the retained provincial indicators
correlation_matrix = df_keep.corr()

# Annotated heatmap on a 10x8 canvas, colour scale pinned to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)

# Render the figure
plt.show()


In [None]:
from scipy.stats import pearsonr

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for every pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to compare pairwise. Each pair is passed to
        ``scipy.stats.pearsonr`` as is, so the columns must be usable by it.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric frame whose index and columns are ``df.columns``.
        Off-diagonal cells hold the p-value returned by ``pearsonr``; the
        diagonal is left at the placeholder value 1.0, so a column paired
        with itself never passes a ``p < alpha`` filter.
    """
    n_cols = df.shape[1]
    pvalues = np.ones((n_cols, n_cols))

    # The p-value is symmetric in its two arguments, so only the upper triangle
    # is computed and then mirrored. Columns are addressed by position, which
    # also keeps the function usable when column labels are duplicated.
    for i in range(n_cols):
        left = df.iloc[:, i]
        for j in range(i + 1, n_cols):
            _, pval = pearsonr(left, df.iloc[:, j])
            pvalues[i, j] = pval
            pvalues[j, i] = pval

    return pd.DataFrame(pvalues, columns=df.columns, index=df.columns)

# p-value for every pair of columns in df_keep
pvalue_matrix = calculate_pvalues(df_keep)

# Keep the entries below the 5% level; every other cell becomes NaN
significance_mask = pvalue_matrix < 0.05
significant_pvalues = pvalue_matrix.where(significance_mask)

# Show the filtered matrix
print(significant_pvalues)

In [None]:
# Correlation coefficients between total 2022 GDP and each case-count variable.
# The list is hand-picked ('dispute_area_3.0' is not included).
sig_list = ['total_violence_cases', 'dispute_area_1.0',
       'dispute_area_2.0', 'dispute_area_4.0',
       'dispute_area_5.0', 'dispute_area_6.0', 
       'types_of_perpetrator_1.0', 'types_of_perpetrator_2.0',
       'types_of_perpetrator_3.0']

# One Pearson test per variable, printed as "coefficient ..., pvalue ..."
for var in sig_list:
    corr, pvalue = pearsonr(df_keep['gdp_2022'], df_keep[var])
    print(f'coefficient of {var} and gdp_2022: {corr}, pvalue: {pvalue}')


In [None]:
# Pearson test between 2021 per-capita GDP and the per-province mean of 'worst_violence_types'.
# The printed label reads "per capita" (it previously said "per capital").
corr, pvalue = pearsonr(df_keep['gdp_pc_2021'], df_keep['worst_violence_types'])
print(f'correlation coefficient of GDP per capita in 2021 and average degrees of violence is {corr}, p-value is {pvalue}')

In [None]:
# Attach both provincial GDP tables to every row of the original dataset
# (two successive inner joins on 'province')
df_merged_full = (
    df
    .merge(df_gdp, on='province')
    .merge(df_gdp_pc, on='province')
)
df_merged_full.columns

In [None]:
# Row-level Pearson test: coded violence level of each row vs. its province's 2021 per-capita GDP
corr, pvalue = pearsonr(df_merged_full['worst_violence_types'], df_merged_full['gdp_pc_2021'])
print(corr, pvalue)

# Merge protest data

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
prot_df = pd.read_pickle('../supplementary_data/protest_data_all.pkl')

In [None]:
prot_df.info()

In [None]:
# Distinct raw province labels in the protest data, most frequent first
# (used to build the normalisation mapping in the next cell)
prot_df['Province'].value_counts().index

In [None]:
# Inspect the rows carrying the unrecognised label '哪门股'
prot_df[prot_df['Province'] == '哪门股']

In [None]:
# Normalisation table for the protest data: full names, already-short names and
# traditional-character variants all map to the short simplified province name.
# NOTE: this rebinds `province_mapping`, replacing the mapping defined for the GDP tables;
# re-running an earlier cell that uses the name afterwards will pick up this version.
# '-' is mapped to None (missing province).
# NOTE(review): '哪门股' is mapped to '内蒙古' — presumably a garbled label; confirm against the rows inspected above.
province_mapping = {
    '广东省': '广东', '河南省': '河南', '山东省': '山东', '陕西省': '陕西', 
    '江苏省': '江苏', '广东': '广东', '河北省': '河北', '浙江省': '浙江', 
    '四川省': '四川', '河南': '河南', '山东': '山东', '福建省': '福建', 
    '河北': '河北', '湖南省': '湖南', '湖北省': '湖北', '辽宁省': '辽宁', 
    '北京市': '北京', '四川': '四川', '陕西': '陕西', '上海市': '上海', 
    '山西省': '山西', '吉林省': '吉林', '江苏': '江苏', '黑龙江省': '黑龙江', 
    '安徽省': '安徽', '内蒙古自治区': '内蒙古', '辽宁': '辽宁', '湖南': '湖南', 
    '广西壮族自治区': '广西', '湖北': '湖北', '重庆市': '重庆', '贵州省': '贵州', 
    '江西省': '江西', '甘肃省': '甘肃', '云南省': '云南', '浙江': '浙江', 
    '北京': '北京', '上海': '上海', '天津市': '天津', '广西': '广西', 
    '新疆维吾尔自治区': '新疆', '-': None, '山西': '山西', '江西': '江西', 
    '福建': '福建', '吉林': '吉林', '重庆': '重庆', '海南省': '海南', 
    '安徽': '安徽', '黑龙江': '黑龙江', '内蒙古': '内蒙古', '云南': '云南', 
    '宁夏回族自治区': '宁夏', '海南': '海南', '天津': '天津', '贵州': '贵州', 
    '甘肃': '甘肃', '青海省': '青海', '西藏自治区': '西藏', '新疆': '新疆', 
    '西藏': '西藏', '香港特别行政区': '香港', '宁夏': '宁夏', '青海': '青海', 
    '江蘇省': '江苏', '重慶市': '重庆', '哪门股': '内蒙古'
}
# Canonicalise the province labels in place; labels that are not keys stay unchanged
prot_df['Province'] = prot_df['Province'].replace(province_mapping)

In [None]:
# Protest count per canonical province as a two-column frame: 'province', 'total_protest_cases'
total_protests_by_province = (
    prot_df['Province']
    .value_counts()
    .rename_axis('province')
    .reset_index(name='total_protest_cases')
)
total_protests_by_province

In [None]:
# Inner join: provinces without any protest record drop out of df_merged.
# NOTE: df_merged is rebound here, so re-running this cell merges the protest column a second time.
df_merged = pd.merge(df_merged, total_protests_by_province, on = 'province')
df_merged.info()

In [None]:
# Pearson test between the two per-province counts
corr, pvalue = pearsonr(df_merged['total_violence_cases'], df_merged['total_protest_cases'])
print(f'correlation coefficient of total violence case numbers and total protest case numbers is {corr}, p-value is {pvalue}')

In [None]:
# Provincial indicators to export (the perpetrator-type counts are not carried over)
cols_to_keep = ['province', 'total_violence_cases', 'gdp_pc_2021', 'gdp_2022', 'dispute_area_1.0', 'dispute_area_2.0',\
               'dispute_area_3.0', 'dispute_area_4.0', 'dispute_area_5.0', 'dispute_area_6.0', 'worst_violence_types',\
                'total_protest_cases']
df_merged_keep = df_merged[cols_to_keep]

In [None]:
# Positional rename: the i-th name below replaces the i-th entry of cols_to_keep,
# so the two sequences must stay aligned (both have twelve entries).
df_merged_keep.columns = ('province', 'total_violence_cases_Weibo', 'gdp_per_capita_2021', 'gdp_2022', 'dispute_area_land_housing', 'dispute_area_financial', 'dispute_area_employment', 'dispute_area_political_legal_policing', 'dispute_area_covid19', 'dispute_area_personal', 'average_severity_of_violence_Weibo', 'total_protest_cases')

In [None]:
# NOTE(review): to_excel writes the row index as a leading column unless index=False is passed — confirm that is intended.
df_merged_keep.to_excel('../supplementary_data/provincial_data.xlsx')

In [None]:
x = df_merged_keep['province']
y1 = df_merged_keep['total_protest_cases']
y2 = df_merged_keep['total_violence_cases_Weibo']

# z-score both series (pandas' default sample standard deviation) so they share one axis
y1_mean, y1_std_dev = y1.mean(), y1.std()
y1_scaled = (y1 - y1_mean) / y1_std_dev

y2_mean, y2_std_dev = y2.mean(), y2.std()
y2_scaled = (y2 - y2_mean) / y2_std_dev

# One line per series, provinces along the x axis
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, y1_scaled, label='total_protest_cases', color='blue', marker='o')
ax.plot(x, y2_scaled, label='total_hired_violence_cases_Weibo', color='orange', marker='x')

# Title, axis labels and legend
ax.set_title('Compare hired violence cases in Weibo and total protest cases by province')
ax.set_xlabel('Provinces')
ax.set_ylabel('Number of cases (standardized)')
ax.legend()

# Render the figure
plt.show()


In [None]:
# Pearson test on the two standardised series plotted above
corr, pvalue = pearsonr(y1_scaled, y2_scaled)
print(f'correlation coefficient is {corr}, p-value is {pvalue}')

# Merge Real Estate Data

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df_re = pd.read_pickle('../supplementary_data/real_estate_data_2019.pkl')

In [None]:
df_re.info()

In [None]:
# Inner join on 'province'; df_merged_re is the working frame for the rest of the notebook
df_merged_re = pd.merge(df_merged_keep, df_re, on = 'province')

In [None]:
# Calculate violence cases per capita: divide total violence cases by resident population
# NOTE(review): the unit of 'resident_population' is not visible here (persons vs. thousands etc.) — confirm before interpreting magnitudes.
df_merged_re['violence_per_capita'] = df_merged_re['total_violence_cases_Weibo']/df_merged_re['resident_population']


In [None]:
# Calculate the share of land-sale revenue in total revenue ('land_sale' / 'revenue')
df_merged_re['land_by_revenue'] = df_merged_re['land_sale']/df_merged_re['revenue']

In [None]:
df_merged_re.columns

In [None]:
# Columns entering the second correlation analysis: the exported indicators, the
# columns brought in by the real-estate merge, and the two ratios derived above.
cols_for_corr = ['total_violence_cases_Weibo', 'gdp_per_capita_2021',
       'gdp_2022', 'dispute_area_land_housing', 'dispute_area_financial',
       'dispute_area_employment', 'dispute_area_political_legal_policing',
       'dispute_area_covid19', 'dispute_area_personal',
       'average_severity_of_violence_Weibo', 'total_protest_cases', 'crime_rate', 'gdppc', 'cpi', 'securitypc', 'unemp_rate',
       'kid_population', 'expenditure', 'mig_population', 'population',
       'realest_invest', 'realest_landspending', 'revenue',
       'state_owned_resource_income', 'urban_population',
       'resident_population', 'urban_income', 'rural_income', 'servants',
       'land_sale', 'youth_pct', 'mig_pct', 'landrev_byexp',
       'landrev_byinvest', 'resource_inc', 'org', 'urb_pct',
       'urb_rural_income', 'servant_ratio', 'violence_per_capita', 'land_by_revenue']
df_corr = df_merged_re[cols_for_corr]

In [None]:
# Pairwise correlations; this rebinds `correlation_matrix` from the earlier heatmap cell
correlation_matrix = df_corr.corr()

In [None]:
from scipy.stats import pearsonr

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for every pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to compare pairwise. Each pair is passed to
        ``scipy.stats.pearsonr`` as is, so the columns must be usable by it.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric frame whose index and columns are ``df.columns``.
        Off-diagonal cells hold the p-value returned by ``pearsonr``; the
        diagonal is left at the placeholder value 1.0, so a column paired
        with itself never passes a ``p < alpha`` filter.
    """
    n_cols = df.shape[1]
    pvalues = np.ones((n_cols, n_cols))

    # The p-value is symmetric in its two arguments, so only the upper triangle
    # is computed and then mirrored. Columns are addressed by position, which
    # also keeps the function usable when column labels are duplicated.
    for i in range(n_cols):
        left = df.iloc[:, i]
        for j in range(i + 1, n_cols):
            _, pval = pearsonr(left, df.iloc[:, j])
            pvalues[i, j] = pval
            pvalues[j, i] = pval

    return pd.DataFrame(pvalues, columns=df.columns, index=df.columns)

# p-value for every pair of columns in df_corr
pvalue_matrix = calculate_pvalues(df_corr)

# Keep the entries below the 5% level; every other cell becomes NaN
significance_mask = pvalue_matrix < 0.05
significant_pvalues = pvalue_matrix.where(significance_mask)

# Show the filtered matrix
print(significant_pvalues)

In [None]:
# Correlation coefficients between the Weibo case count and a hand-picked list of other variables
sig_list = ['gdp_2022', 'cpi', 'securitypc', 'kid_population', 'expenditure', 'population', 'realest_invest', 'revenue', 'state_owned_resource_income', 'urban_population', 'resident_population', 'servants', 'land_sale', 'landrev_byexp', 'org']


# One Pearson test per variable
for var in sig_list:
    corr, pvalue = pearsonr(df_corr['total_violence_cases_Weibo'], df_corr[var])
    print(f'coefficient of {var} and total_violence_cases_Weibo: {corr}, pvalue: {pvalue}')


In [None]:
df_merged_re['violence_per_capita']

In [None]:
# Weibo case count vs. land-sale revenue
corr, pvalue = pearsonr(df_merged_re['total_violence_cases_Weibo'], df_merged_re['land_sale'])
print(f'correlation coefficient is {corr}, p-value is {pvalue}')

In [None]:
# Mean coded violence level vs. 'securitypc'
corr, pvalue = pearsonr(df_merged_re['average_severity_of_violence_Weibo'], df_merged_re['securitypc'])
print(f'correlation coefficient is {corr}, p-value is {pvalue}')

# Merge combined_violence_by_province

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
vio_df = pd.read_pickle('../supplementary_data/combined_violence_by_province.pkl')

In [None]:
# Inner join on 'province'. The following cells read 'total_violence_cases' from the merged frame.
# NOTE: df_merged_re is rebound here, so re-running this cell merges the same columns again.
df_merged_re = pd.merge(df_merged_re, vio_df, on = 'province')

In [None]:
x = df_merged_re['province']
y1 = df_merged_re['total_violence_cases']
y2 = df_merged_re['total_violence_cases_Weibo']

# z-score both series (pandas' default sample standard deviation) so they share one axis
y1_mean, y1_std_dev = y1.mean(), y1.std()
y1_scaled = (y1 - y1_mean) / y1_std_dev

y2_mean, y2_std_dev = y2.mean(), y2.std()
y2_scaled = (y2 - y2_mean) / y2_std_dev

# One line per series, provinces along the x axis
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, y1_scaled, label='combined_violence_cases', color='blue', marker='o')
ax.plot(x, y2_scaled, label='total_hired_violence_cases_Weibo', color='orange', marker='x')

# Title, axis labels and legend
ax.set_title('Compare hired violence cases in Weibo and combined violence cases by province')
ax.set_xlabel('Provinces')
ax.set_ylabel('Number of cases (standardized)')
ax.legend()

# Render the figure
plt.show()


In [None]:
# Pearson test on the two standardised series plotted above
corr, pvalue = pearsonr(y1_scaled, y2_scaled)
print(f'correlation coefficient is {corr}, p-value is {pvalue}')

In [None]:
# Combined violence count vs. protest count, per province
corr, pvalue = pearsonr(df_merged_re['total_violence_cases'], df_merged_re['total_protest_cases'])
print(f'correlation coefficient is {corr}, p-value is {pvalue}')

In [None]:
## Open question: can we use this plot to validate our data?

# Regressions - total_violence_cases_Weibo

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [None]:
df_merged_re.columns

In [None]:
# Distribution of the outcome variable (histogram with a KDE overlay)
sns.histplot(df_merged_re['total_violence_cases_Weibo'], kde = True)

In [None]:
# Lasso
# Separate the features and the target variable
y = df_merged_re['total_violence_cases_Weibo']
indi_var = ['gdp_per_capita_2021',
       'gdp_2022', 
       'crime_rate', 'gdppc', 'cpi', 'securitypc', 'unemp_rate',
       'kid_population', 'expenditure', 'mig_population', 'population',
       'realest_invest', 'realest_landspending', 'revenue',
       'state_owned_resource_income', 'urban_population',
       'resident_population', 'urban_income', 'rural_income', 'servants',
       'land_sale', 'youth_pct', 'mig_pct', 'landrev_byexp',
       'landrev_byinvest', 'resource_inc', 'org', 'urb_pct',
       'urb_rural_income', 'servant_ratio']
x = df_merged_re[indi_var]

# Standardize the features with a scaler of their own, so the fitted feature
# means/variances stay available after the target has been scaled as well.
x_scaler = StandardScaler()
x_scaled = x_scaler.fit_transform(x)

# StandardScaler expects 2-D input: lift the target Series to a single column
y_reshaped = y.values.reshape(-1, 1)

# Scale the target with a separate scaler. Fitting the feature scaler again on y
# would overwrite its feature statistics. After this cell `scaler` holds the
# target statistics, so it can be used to map predictions back to case counts.
scaler = StandardScaler()
y_scaled = scaler.fit_transform(y_reshaped).ravel()  # back to 1-D for the estimator

# `alpha` sets the strength of the L1 penalty
lasso = Lasso(alpha=0.5)
lasso.fit(x_scaled, y_scaled)

# Access the coefficients
coefficients = lasso.coef_
intercept = lasso.intercept_

# One line per feature, in the order of `indi_var`
for feature, coef in zip(x.columns, coefficients):
    print(f"{feature}: {coef}")

In [None]:
# ! pip install statsmodels

In [None]:
import statsmodels.api as sm

# Outcome: Weibo case count per province
y = df_merged_re['total_violence_cases_Weibo']
selected_var = [
       'gdp_2022', 
       'cpi', 'securitypc', 
       'kid_population', 'expenditure', 'population',
       'realest_invest', 'revenue',
       'state_owned_resource_income', 'urban_population',
       'resident_population', 'servants',
       'land_sale', 'landrev_byexp',
       'org']

# Design matrix: the selected regressors (left unscaled) plus a column of ones for the intercept
x = sm.add_constant(df_merged_re[selected_var])

# Ordinary least squares fit and its summary table
ols_model = sm.OLS(y, x).fit()
print(ols_model.summary())

# Robust linear model on the same design matrix, followed by its summary
robust_model = sm.RLM(y, x).fit()
print(robust_model.summary())


# Regression - 'average_severity_of_violence_Weibo'

In [None]:
# # total_case_severity
# df_merged_re['total_severity'] = df_merged_re['average_severity_of_violence_Weibo'] * df_merged_re['total_violence_cases_Weibo']

In [None]:
# Natural logarithm of the mean severity.
# Note: np.log yields -inf for 0 and NaN for negative inputs.
df_merged_re['log_average_severity_of_violence_Weibo'] = np.log(df_merged_re['average_severity_of_violence_Weibo'])

In [None]:
# Distribution of the outcome variable (30-bin histogram with a KDE overlay)
sns.histplot(df_merged_re['log_average_severity_of_violence_Weibo'], bins=30, kde=True)

In [None]:
# Separate the features and the target variable (log mean severity)
y = df_merged_re['log_average_severity_of_violence_Weibo']
indi_var = ['gdp_per_capita_2021','urban_income', 'rural_income', 'youth_pct', 'mig_pct', 
       'landrev_byinvest', 'urb_pct',
       'urb_rural_income']
x = df_merged_re[indi_var]

# Add a constant (intercept term) to the model
x = sm.add_constant(x)  # Adds a column of ones for the intercept term

# Fit the OLS model (this rebinds `ols_model` from the previous regression)
ols_model = sm.OLS(y, x).fit()

# Print out the summary of the regression
print(ols_model.summary())

# Fit the robust regression model on the same design matrix
robust_model = sm.RLM(y, x).fit()

# Print the summary of the robust model
print(robust_model.summary())


In [None]:
# Using Lasso to select variables
# Separate the features and the target variable
y = df_merged_re['log_average_severity_of_violence_Weibo']
indi_var = ['gdp_per_capita_2021',
       'gdp_2022', 
       'crime_rate', 'gdppc', 'cpi', 'securitypc', 'unemp_rate',
       'kid_population', 'expenditure', 'mig_population', 'population',
       'realest_invest', 'realest_landspending', 'revenue',
       'state_owned_resource_income', 'urban_population',
       'resident_population', 'urban_income', 'rural_income', 'servants',
       'land_sale', 'youth_pct', 'mig_pct', 'landrev_byexp',
       'landrev_byinvest', 'resource_inc', 'org', 'urb_pct',
       'urb_rural_income', 'servant_ratio']
x = df_merged_re[indi_var]

# Standardize the features with a scaler of their own, so the fitted feature
# means/variances stay available after the target has been scaled as well.
x_scaler = StandardScaler()
x_scaled = x_scaler.fit_transform(x)

# StandardScaler expects 2-D input: lift the target Series to a single column
y_reshaped = y.values.reshape(-1, 1)

# Scale the target with a separate scaler. Fitting the feature scaler again on y
# would overwrite its feature statistics. After this cell `scaler` holds the
# target statistics, so it can be used to map predictions back to the log scale.
scaler = StandardScaler()
y_scaled = scaler.fit_transform(y_reshaped).ravel()  # back to 1-D for the estimator

# `alpha` sets the strength of the L1 penalty
lasso = Lasso(alpha=0.5)
lasso.fit(x_scaled, y_scaled)

# Access the coefficients
coefficients = lasso.coef_
intercept = lasso.intercept_

# One line per feature, in the order of `indi_var`
for feature, coef in zip(x.columns, coefficients):
    print(f"{feature}: {coef}")

# Regression: 'dispute_area_land_housing'

In [None]:
# Distribution of the outcome variable (30-bin histogram with a KDE overlay)
sns.histplot(df_merged_re['dispute_area_land_housing'], bins=30, kde=True)

In [None]:
df_merged_re.columns

In [None]:
# df_merged_re['urban_pop_ratio'] = df_merged_re['urban_population']/df_merged_re['resident_population']

In [None]:
# Separate the features and the target variable (count of land/housing disputes)
y = df_merged_re['dispute_area_land_housing']
indi_var = ['gdp_2022', 'cpi', 
       'kid_population', 'population',
       'urban_population',
       'resident_population', 'servants',
       'org']
x = df_merged_re[indi_var]

# Add a constant (intercept term) to the model
x = sm.add_constant(x)  # Adds a column of ones for the intercept term

# Fit the OLS model (this rebinds `ols_model` from the previous regression)
ols_model = sm.OLS(y, x).fit()

# Print out the summary of the regression
print(ols_model.summary())

# Fit the robust regression model on the same design matrix
robust_model = sm.RLM(y, x).fit()

# Print the summary of the robust model
print(robust_model.summary())


# Check violence case per person

In [None]:
# Define the data as a dictionary of two position-aligned lists (31 provinces, 31 head counts)
pop_2020 = {
    'province': ['北京', '天津', '河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '上海', '江苏', '浙江', '安徽', '福建', '江西', 
                 '山东', '河南', '湖北', '湖南', '广东', '广西', '海南', '重庆', '四川', '贵州', '云南', '西藏', '陕西', 
                 '甘肃', '青海', '宁夏', '新疆'],
    'population_2020': [21893095, 13866009, 74610235, 34915616, 24049155, 42591407, 24073453, 31850088, 24870895, 84748016, 
                   64567588, 61027171, 41540086, 45188635, 101527453, 99365519, 57752557, 66444864, 126012510, 50126804, 
                   10081232, 32054159, 83674866, 38562148, 47209277, 3648100, 39528999, 25019831, 5923957, 7202654, 25852345]
}

# Create the DataFrame
df_population = pd.DataFrame(pop_2020)

# Show the DataFrame
df_population


In [None]:
# Attach the 2020 population (inner join on 'province')
df_merged_re = pd.merge(df_merged_re, df_population, on = 'province')

In [None]:
# Attach the per-capita GDP columns (gdp_pc_*)
df_merged_re = pd.merge(df_merged_re, df_gdp_pc, on = 'province')

In [None]:
# Attach the total GDP columns. df_merged_re already carries a 'gdp_2022' column, so pandas
# disambiguates the pair as 'gdp_2022_x' (existing) and 'gdp_2022_y' (from df_gdp);
# a plain 'gdp_2022' column no longer exists after this cell.
df_merged_re = pd.merge(df_merged_re, df_gdp, on = 'province')

In [None]:
df_merged_re.columns

In [None]:
# Weibo cases per one million inhabitants (2020 population)
df_merged_re['violence_case_per_million_person'] = (df_merged_re['total_violence_cases_Weibo'] * 1000000)/df_merged_re['population_2020']

In [None]:
# Total 2020 GDP divided by the 2020 head count.
# NOTE(review): the result is expressed in gdp_2020's own unit per person; that unit is not stated
# in this notebook, so check it before comparing magnitudes with 'gdp_pc_2020'.
df_merged_re['gdppc_2020'] = df_merged_re['gdp_2020']/df_merged_re['population_2020']

In [None]:
# Side-by-side view of the derived figure and the per-capita figure loaded from the spreadsheet
df_merged_re[['gdppc_2020', 'gdp_pc_2020']]

In [None]:
# Pearson test between the population-adjusted case rate and the derived per-capita GDP.
# The rate column created earlier in this section is 'violence_case_per_million_person';
# no 'violence_case_per_person' column is ever created, so that label raises KeyError.
# (Pearson's r is unaffected by the per-million scaling.)
corr, pvalue = pearsonr(df_merged_re['violence_case_per_million_person'], df_merged_re['gdppc_2020'])
print(corr, pvalue)

In [None]:
# Raw case count vs. 2020 population
corr, pvalue = pearsonr(df_merged_re['total_violence_cases_Weibo'], df_merged_re['population_2020'])
print(corr, pvalue)

In [None]:
df_merged_re.info()

# Merge corruption data

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
crp_data = pd.read_pickle('../supplementary_data/corruption_data_2020.pkl')

In [None]:
# Inner join on 'province'; df_merged_re is rebound, so re-running this cell merges the columns again
df_merged_re = pd.merge(df_merged_re, crp_data, on = 'province')

In [None]:
df_merged_re.info()

# Merge audit data

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
audit = pd.read_pickle('../supplementary_data/audit_data_2019.pkl')

In [None]:
# Inner join on 'province'; df_merged_re is rebound, so re-running this cell merges the columns again
df_merged_re = pd.merge(df_merged_re, audit, on = 'province')

In [None]:
df_merged_re.columns

In [None]:
# Columns retained for the regression stage. 'gdp_2022_x' / 'gdp_2022_y' are the two copies of
# 2022 GDP produced when the GDP table was merged onto a frame that already had 'gdp_2022'.
# The dispute-area count columns are not part of this list.
col_for_reg = ['province', 'total_violence_cases_Weibo', 'gdp_per_capita_2021',
       'gdp_2022_x', 'average_severity_of_violence_Weibo', 'total_protest_cases', 'Year',
       'Region', 'crime_rate', 'gdppc', 'cpi', 'securitypc', 'unemp_rate',
       'kid_population', 'expenditure', 'mig_population', 'population',
       'realest_invest', 'realest_landspending', 'revenue',
       'state_owned_resource_income', 'urban_population',
       'resident_population', 'urban_income', 'rural_income', 'servants',
       'land_sale', 'youth_pct', 'mig_pct', 'landrev_byexp',
       'landrev_byinvest', 'resource_inc', 'org', 'urb_pct',
       'urb_rural_income', 'servant_ratio', 'violence_per_capita',
       'land_by_revenue', 'total_violence_cases', 'hired_cases', 'hired_ratio',
       'log_average_severity_of_violence_Weibo', 'population_2020',
       'gdp_pc_2021', 'gdp_pc_2020', 'gdp_pc_2015', 'gdp_pc_2010',
       'gdp_pc_2000', 'gdp_2022_y', 'gdp_2021', 'gdp_2020', 'gdp_2015',
       'gdp_2010', 'gdp_2000', 'violence_case_per_million_person',
       'gdppc_2020', 'number_of_corruption_cases',
       'number_of_audit_cases_2019']