In [1]:
import os
from os.path import join
from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
drive.mount("/content/drive/")
directory="/content/drive/MyDrive/data"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
files = [f for f in os.listdir(directory) if os.path.isfile(join(directory, f))]
files

['R1400.xlsx',
 'U1400.xlsx',
 'R99.xlsx',
 'U99.xlsx',
 'R1401.xlsx',
 'U1401.xlsx',
 'R98.xlsx',
 'U98.xlsx']

In [4]:
all_files = {}

for file_name in files:
    file_path = join(directory, file_name)
    with pd.ExcelFile(file_path) as excel_file:
        sheets = {sheet: pd.read_excel(excel_file, sheet) for sheet in excel_file.sheet_names}
    all_files[file_name.split('.')[0]] = sheets

In [5]:
for file_name, sheets in all_files.items():
    print(f"File: {file_name}")
    for sheet_name, data in sheets.items():
        print(f"  Sheet: {sheet_name}, Data shape: {data.shape}")

File: R1400
  Sheet: R1400Data, Data shape: (18370, 18)
  Sheet: R1400P1, Data shape: (62972, 10)
  Sheet: R1400P2, Data shape: (19452, 45)
  Sheet: R1400P3S01, Data shape: (509426, 7)
  Sheet: R1400P3S02, Data shape: (3621, 7)
  Sheet: R1400P3S03, Data shape: (26019, 4)
  Sheet: R1400P3S04, Data shape: (69396, 5)
  Sheet: R1400P3S05, Data shape: (87768, 4)
  Sheet: R1400P3S06, Data shape: (29465, 4)
  Sheet: R1400P3S07, Data shape: (32866, 4)
  Sheet: R1400P3S08, Data shape: (34928, 4)
  Sheet: R1400P3S09, Data shape: (9523, 4)
  Sheet: R1400P3S10, Data shape: (0, 4)
  Sheet: R1400P3S11, Data shape: (9206, 4)
  Sheet: R1400P3S12, Data shape: (66456, 4)
  Sheet: R1400P3S13, Data shape: (180585, 3)
  Sheet: R1400P3S14, Data shape: (4014, 4)
  Sheet: R1400P4S01, Data shape: (10731, 19)
  Sheet: R1400P4S02, Data shape: (12194, 19)
  Sheet: R1400P4S03, Data shape: (25097, 11)
  Sheet: R1400P4S04, Data shape: (32053, 8)
File: U1400
  Sheet: U1400Data, Data shape: (19618, 18)
  Sheet: R1400P

In [6]:
import copy
data_copy = copy.deepcopy(all_files)

In [7]:
cost_sheets = ["P3S01", "P3S02", "P3S03", "P3S04", "P3S05", "P3S06", "P3S07",
               "P3S08", "P3S09", "P3S10","P3S11","P3S12","P3S13",]
income_sheets = ["P4S01", "P4S02", "P4S03", "P4S04"]
required_sheets = []
required_sheets.extend(cost_sheets)
required_sheets.extend(income_sheets)
required_sheets

['P3S01',
 'P3S02',
 'P3S03',
 'P3S04',
 'P3S05',
 'P3S06',
 'P3S07',
 'P3S08',
 'P3S09',
 'P3S10',
 'P3S11',
 'P3S12',
 'P3S13',
 'P4S01',
 'P4S02',
 'P4S03',
 'P4S04']

In [8]:
required_dfs = {}
for f, sheets in data_copy.items():
    required_dfs[f] = {}
    for sheet, df in sheets.items():
        for required_sheet in required_sheets:
            if required_sheet in sheet:
                required_dfs[f][required_sheet] = df
                required_dfs[f][required_sheet]["file"] = f

for f, sheets in required_dfs.items():
    for sheet, df in sheets.items():
        print(f)
        # if sheet == "Data":
        print(f"{sheet} : {df.shape}")
        print("-------------------------------------")

R1400
P3S01 : (509426, 8)
-------------------------------------
R1400
P3S02 : (3621, 8)
-------------------------------------
R1400
P3S03 : (26019, 5)
-------------------------------------
R1400
P3S04 : (69396, 6)
-------------------------------------
R1400
P3S05 : (87768, 5)
-------------------------------------
R1400
P3S06 : (29465, 5)
-------------------------------------
R1400
P3S07 : (32866, 5)
-------------------------------------
R1400
P3S08 : (34928, 5)
-------------------------------------
R1400
P3S09 : (9523, 5)
-------------------------------------
R1400
P3S10 : (0, 5)
-------------------------------------
R1400
P3S11 : (9206, 5)
-------------------------------------
R1400
P3S12 : (66456, 5)
-------------------------------------
R1400
P3S13 : (180585, 4)
-------------------------------------
R1400
P4S01 : (10731, 20)
-------------------------------------
R1400
P4S02 : (12194, 20)
-------------------------------------
R1400
P4S03 : (25097, 12)
--------------------------------

In [9]:
for sheets in required_dfs.values():
    try:
        sheets["P4S01"].drop(
            sheets["P4S01"][
                (sheets["P4S01"]["employed_w"] == 1) &
                (sheets["P4S01"]["income_w_y"] <= 0) &
                (sheets["P4S01"]["netincome_w_y"] <= 0)
            ].index,
            inplace=True,
        )
    except Exception as e:
        print(f"{f} ----> P4S01 ----> {e}")
    try:
        sheets["P4S02"].drop(
            sheets["P4S02"][
                (sheets["P4S02"]["employed_s"] == 1) &
                (sheets["P4S02"]["sale"] <= 0) &
                (sheets["P4S02"]["income_s_y"] <= 0)
            ].index,
            inplace=True,
        )
    except Exception as e:
        print(f"{f} ----> P4S02 ----> {e}")

    try:
        sheets["P4S03"].drop(
            sheets["P4S03"][
                (sheets["P4S03"]["income_pension"] < 0) |
                (sheets["P4S03"]["income_rent"] < 0) |
                (sheets["P4S03"]["income_interest"] < 0) |
                (sheets["P4S03"]["income_resale"] < 0) |
                (sheets["P4S03"]["income_transfer"] < 0)
            ].index,
            inplace=True,
        )
    except Exception as e:
        print(f"{f} ----> P4S03 ----> {e}")

U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'
U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'
U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'
U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'
U98 ----> P4S02 ----> '<=' not supported between instances of 'str' and 'int'
U98 ----> P4S02 ----> '<=' not supported between instances of 'str' and 'int'
U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'
U98 ----> P4S03 ----> '<' not supported between instances of 'str' and 'int'


In [10]:
overall_dfs = {}

for f in required_dfs.keys():
    for required_sheet in required_sheets:
        data = overall_dfs.get(required_sheet)
        if data is None:
            overall_dfs[required_sheet] = required_dfs[f][required_sheet]
        else:
            overall_dfs[required_sheet] = pd.concat(
                [overall_dfs[required_sheet], required_dfs[f][required_sheet]],
                axis=0,
            )

In [11]:
for k in overall_dfs.keys():
    print(f"{k} : {overall_dfs[k].shape}")

P3S01 : (4459722, 8)
P3S02 : (28325, 8)
P3S03 : (207116, 5)
P3S04 : (618958, 6)
P3S05 : (727275, 5)
P3S06 : (250916, 5)
P3S07 : (297230, 5)
P3S08 : (299383, 5)
P3S09 : (87297, 5)
P3S10 : (0, 5)
P3S11 : (94616, 5)
P3S12 : (576300, 5)
P3S13 : (1635316, 4)
P4S01 : (92252, 20)
P4S02 : (68086, 20)
P4S03 : (188752, 12)
P4S04 : (222685, 9)


In [12]:
for sheet, df in overall_dfs.items():
    print(f"duplicates of {sheet} : {df[df.duplicated(keep='first')].shape} -----> {df.shape}")

duplicates of P3S01 : (0, 8) -----> (4459722, 8)
duplicates of P3S02 : (0, 8) -----> (28325, 8)
duplicates of P3S03 : (0, 5) -----> (207116, 5)
duplicates of P3S04 : (0, 6) -----> (618958, 6)
duplicates of P3S05 : (0, 5) -----> (727275, 5)
duplicates of P3S06 : (0, 5) -----> (250916, 5)
duplicates of P3S07 : (0, 5) -----> (297230, 5)
duplicates of P3S08 : (0, 5) -----> (299383, 5)
duplicates of P3S09 : (0, 5) -----> (87297, 5)
duplicates of P3S10 : (0, 5) -----> (0, 5)
duplicates of P3S11 : (0, 5) -----> (94616, 5)
duplicates of P3S12 : (0, 5) -----> (576300, 5)
duplicates of P3S13 : (2586, 4) -----> (1635316, 4)
duplicates of P4S01 : (1, 20) -----> (92252, 20)
duplicates of P4S02 : (0, 20) -----> (68086, 20)
duplicates of P4S03 : (0, 12) -----> (188752, 12)
duplicates of P4S04 : (18, 9) -----> (222685, 9)


`P3S13`, `P4S01` and `P4S04` has duplicates and as they are low, we can drop them.

In [13]:
overall_dfs_copy = copy.deepcopy(overall_dfs)

In [14]:
overall_dfs["P4S04"][overall_dfs["P4S04"].duplicated(keep=False)].head(10)

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00,file
14360,22301631723,1,1.0,12.0,1260000.0,2,1401,,R1400
14361,22301631723,1,1.0,12.0,1260000.0,2,1401,,R1400
14097,12301278824,1,1.0,12.0,5460000.0,2,1401,,U1400
14098,12301278824,1,1.0,12.0,5460000.0,2,1401,,U1400
22901,12501305132,1,1.0,12.0,1260000.0,3,1401,,U1400
22902,12501305132,1,1.0,12.0,1260000.0,3,1401,,U1400
30022,12301276227,1,1.0,12.0,5460000.0,4,1401,,U1400
30023,12301276227,1,1.0,12.0,5460000.0,4,1401,,U1400
13665,20404431220,1,,12.0,,3,1399,,R99
13666,20404431220,1,,12.0,,3,1399,,R99


In [15]:
overall_dfs["P4S04"].drop_duplicates(inplace=True)

In [16]:
overall_dfs["P4S01"][overall_dfs["P4S01"].duplicated(keep=False)]

Unnamed: 0,Address,member,employed_w,ISCO_w,ISIC_w,status_w,hours_w,days_w,income_w_m,income_w_y,wage_w_m,wage_w_y,perk_w_m,perk_w_y,netincome_w_m,netincome_w_y,Fasl,year,DYCOL00,file
5452,20005389220,3,2,9314.0,41000.0,3,,,0,40000000,0.0,40000000.0,0.0,0.0,0,40000000,3,1399,,R99
5453,20005389220,3,2,9314.0,41000.0,3,,,0,40000000,0.0,40000000.0,0.0,0.0,0,40000000,3,1399,,R99


In [17]:
overall_dfs["P4S01"].drop_duplicates(inplace=True)

In [18]:
overall_dfs["P3S13"][overall_dfs["P3S13"].duplicated(keep=False)].head(10)

Unnamed: 0,Address,code,value,file
2733,20111403626,125111,285000,R1400
2734,20111403626,125111,285000,R1400
2891,20106401035,125111,285000,R1400
2892,20106401035,125111,285000,R1400
2908,20107401527,125111,285000,R1400
2909,20107401527,125111,285000,R1400
3004,20108401935,125111,285000,R1400
3005,20108401935,125111,285000,R1400
3134,20106401023,125111,285000,R1400
3135,20106401023,125111,285000,R1400


In [19]:
overall_dfs["P3S13"].drop_duplicates(inplace=True)

In [20]:
for sheet, df in overall_dfs.items():
    print(f"duplicates of {sheet} : {df[df.duplicated(keep='first')].shape} -----> {df.shape}")

duplicates of P3S01 : (0, 8) -----> (4459722, 8)
duplicates of P3S02 : (0, 8) -----> (28325, 8)
duplicates of P3S03 : (0, 5) -----> (207116, 5)
duplicates of P3S04 : (0, 6) -----> (618958, 6)
duplicates of P3S05 : (0, 5) -----> (727275, 5)
duplicates of P3S06 : (0, 5) -----> (250916, 5)
duplicates of P3S07 : (0, 5) -----> (297230, 5)
duplicates of P3S08 : (0, 5) -----> (299383, 5)
duplicates of P3S09 : (0, 5) -----> (87297, 5)
duplicates of P3S10 : (0, 5) -----> (0, 5)
duplicates of P3S11 : (0, 5) -----> (94616, 5)
duplicates of P3S12 : (0, 5) -----> (576300, 5)
duplicates of P3S13 : (0, 4) -----> (1632730, 4)
duplicates of P4S01 : (0, 20) -----> (92251, 20)
duplicates of P4S02 : (0, 20) -----> (68086, 20)
duplicates of P4S03 : (0, 12) -----> (188752, 12)
duplicates of P4S04 : (0, 9) -----> (222667, 9)


In [21]:
overall_dfs["P4S04"].head()

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00,file
0,20001384225,1,1.0,12.0,1960000.0,1,1401,,R1400
1,20001384225,2,2.0,12.0,10920000.0,1,1401,,R1400
2,20001384232,1,1.0,12.0,1960000.0,1,1401,,R1400
3,20001384232,2,2.0,12.0,10920000.0,1,1401,,R1400
4,20001384235,1,1.0,12.0,1960000.0,1,1401,,R1400


In [22]:
required_features = {
    "P3S" : ["address", "value", "file"],
    "P3S04" : ["address", "mortgage", "value", "file"],
    "P3S13" : ["address", "value", "file"],
    "P4S01" : ["address", "netincome_w_y", "file"],
    "P4S02" : ["address", "income_s_y", "file"],
    "P4S03" : ["address", "income_pension", "income_rent",
               "income_interest", "income_aid", "income_resale",
               "income_transfer", "file"],
    "P4S04" : ["address", "subsidy", "file"],
}

In [23]:
for sheet, df in overall_dfs.items():
    df.columns = df.columns.str.lower()

    if sheet.startswith("P3S") and sheet != "P3S04" and sheet != "P3S13":
        overall_dfs[sheet] = overall_dfs[sheet][required_features["P3S"]]
        numeric_cols = [col for col in required_features["P3S"] if col not in ["address", "file"]]
    else:
        overall_dfs[sheet] = overall_dfs[sheet][required_features[sheet]]
        numeric_cols = [col for col in required_features[sheet] if col not in ["address", "file"]]

    overall_dfs[sheet][numeric_cols] = overall_dfs[sheet][numeric_cols].apply(pd.to_numeric, errors="coerce")
    if sheet.startswith("P3S"):
        overall_dfs[sheet]["cost"] = overall_dfs[sheet][numeric_cols].sum(axis=1)
    elif sheet == "P4S04":
        pass
    elif sheet.startswith("P4S"):
        overall_dfs[sheet]["income"] = overall_dfs[sheet][numeric_cols].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overall_dfs[sheet][numeric_cols] = overall_dfs[sheet][numeric_cols].apply(pd.to_numeric, errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overall_dfs[sheet]["cost"] = overall_dfs[sheet][numeric_cols].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overall_dfs[sheet][numeri

In [24]:
overall_dfs["P4S01"][overall_dfs["P4S01"]["netincome_w_y"] <= 0]

Unnamed: 0,address,netincome_w_y,file,income
1282,11401173035,0,U1401,0
1283,11401173035,0,U1401,0
1284,11401173429,0,U1401,0
1285,11401173435,0,U1401,0
1319,11402176638,0,U1401,0
1377,11408182226,0,U1401,0
1804,11907227838,0,U1401,0
2166,12301267135,0,U1401,0
2268,12301278338,0,U1401,0
4130,11104138441,0,U1401,0


In [25]:
costs = []
incomes = []
# subsid_df = overall_dfs.pop("P4S04")
for sheet, df in overall_dfs.items():
    if sheet.startswith("P3S"):
        costs.append(df[["address", "cost", "file"]])
    elif sheet == "P4S04":
        subsids_df = df.copy()
    elif sheet.startswith("P4S"):
        incomes.append(df[["address", "income", "file"]])

costs_df = pd.concat(costs).groupby(["address", "file"], as_index=False).sum("cost")
incomes_df = pd.concat(incomes).groupby(["address", "file"], as_index=False).sum("income")

In [26]:
costs_df.head(10)

Unnamed: 0,address,file,cost
0,10001000108,U98,152193000.0
1,10001000111,U98,208140288.0
2,10001000113,U99,243967000.0
3,10001000114,U98,43402000.0
4,10001000116,U99,314822000.0
5,10001000117,U98,89418000.0
6,10001000119,U1400,192519000.0
7,10001000120,U1400,847235000.0
8,10001000120,U98,59097000.0
9,10001000120,U99,289044000.0


In [27]:
incomes_df.head(10)

Unnamed: 0,address,file,income
0,10001000108,U98,392000000.0
1,10001000111,U98,564000000.0
2,10001000113,U99,888000000.0
3,10001000114,U98,144000000.0
4,10001000116,U99,654000000.0
5,10001000117,U98,444000000.0
6,10001000119,U1400,905500000.0
7,10001000120,U1400,1373500000.0
8,10001000120,U98,333000000.0
9,10001000120,U99,976000000.0


In [28]:
subsids_df.head(10)

Unnamed: 0,address,subsidy,file
0,20001384225,1960000.0,R1400
1,20001384225,10920000.0,R1400
2,20001384232,1960000.0,R1400
3,20001384232,10920000.0,R1400
4,20001384235,1960000.0,R1400
5,20001384235,5460000.0,R1400
6,20011394620,1610000.0,R1400
7,20011394620,16380000.0,R1400
8,20011394623,1610000.0,R1400
9,20011394623,10920000.0,R1400


In [29]:
final_df = pd.merge(costs_df, incomes_df, on=["address", "file"])
final_df_with_subsid = pd.merge(final_df, subsids_df, on=["address", "file"])

In [30]:
final_df.head()

Unnamed: 0,address,file,cost,income
0,10001000108,U98,152193000.0,392000000.0
1,10001000111,U98,208140288.0,564000000.0
2,10001000113,U99,243967000.0,888000000.0
3,10001000114,U98,43402000.0,144000000.0
4,10001000116,U99,314822000.0,654000000.0


In [31]:
# final_df["sub"] = final_df["income"] - final_df["cost"]

In [32]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(final_df[["cost", "income"]])

In [33]:
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(x_scaled)
labels = kmeans.labels_
final_df["label"] = labels
centers = kmeans.cluster_centers_

