# <h1 style="text-align: center; color:magenta"> Prediction Section </h1>

## <h2 style="text-align: left; color:cyan"> Data Inspection </h2>

In [1]:
import os
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from sklearn.feature_selection import mutual_info_regression

In [2]:
def empty_string_remover(df):
    for col in df.columns:
    # Checking if string dtype
        if is_string_dtype(df[col]):
            # replace cells with only empty strings with np.nan
            df[col] = df[col].replace(r'^\s*$', np.nan, regex=True)
    return df

def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y, n_neighbors=3, random_state=42)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def cat_target_relation(cat_df, col_name, year_df, agg_func="sum"):
    """
    input df will be grouped by addresses then merged with year_df

    """

    col_one_hot = pd.get_dummies(cat_df[col_name])
    col_one_hot_df = pd.concat((cat_df[["Address"]], col_one_hot), axis=1)
    grouped = (col_one_hot_df.groupby("Address").agg(agg_func)
                                .reset_index())
    
    df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address").fillna(0)
    
    mi_scores = make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), df.transportation_cost)
    print(mi_scores)

def family_bool_col_target_relation(df, col_name, year_df):
    """
    input df should be grouped by addresses it will be merged with year_df
    
    """
    df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=df[["Address", col_name]], how="left", on="Address")
    if df[col_name].dtype == "bool":
        df[col_name] = df[col_name].astype(float)
        df.fillna({col_name: 0}, inplace=True)    
    mi_scores = make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost)
    print(mi_scores)
  
    

In [3]:
# A dictionary of sheet description to better remember the purpose of each sheet.
sheets_desc = {"Data": "مشخصات پرسشنامه",
               "P1":"قسمت یکم: خصوصیات اجتماعی اعضای خانوار",
               "P2": "قسمت دوم: مشخصات محل سکونت",
               "P3S01": "قسمت سوم: بخش ۱ هزینه‌های خوراکی خانوار در ماه گذشته",
               "P3S02": "قسمت سوم: بخش ۲ هزینه‌های نوشیدنی‌های طبقه‌بندی نشده و دخانی های خانوار در ماه گذشته",
               "P3S03": "قسمت سوم: بخش ۳ هزینه‌های پوشاک و کفش خانوار در ماه گذشته",
               "P3S04": "قسمت سوم: بخش ۴ هزینه‌های بخش مسکن، آب، فاضلاب، سوخت و روشنایی خانوار در ماه گذشته",
               "P3S05": "قسمت سوم: بخش ۵ هزینه‌های مبلمان و لوازم خانگی",
               "P3S06": "قسمت سوم: بخش ۶ هزینه‌های بهداشتی و درمانی خانوار در ماه گذشته",
               "P3S07": "قسمت سوم: بخش ۷ هزینه‌های حمل و نقل خانوار در ماه گذشته",
               "P3S08": "قسمت سوم: بخش ۸ هزینه‌های ارتباطات خانوار در ماه گذشته",
               "P3S09": "قسمت سوم: بخش ۹ هزینه‌های خدمات فرهنگی و تفریحات خانوار در ماه گذشته",
               "P3S10": "قسمت سوم: بخش ۱۰ هزینه‌های آموزش و تحصیل",  # empty sheet, included in P3S13
               "P3S11": "قسمت سوم: بخش ۱۱ هزینه‌های غذاهای آماده، هتل و رستوران خانوار در ماه گذشته",
               "P3S12": "قسمت سوم: بخش ۱۲ هزینه‌های کالاها و خدمات متفرقه خانوار در ماه گذشته",
               "P3S13": "قسمت سوم: بخش ۱۳ سایر هزینه‌ها و انتقالات در ۱۲ ماه گذشته",
               "P3S14": "قسمت سوم: بخش ۱۴ سرمایه‌گذاری خانوار در ۱۲ ماه گذشته",
               "P4S01": "قسمت چهارم: بخش ۱ درآمد پولی اعضای شاغل خانوار از مشاغل مزد و حقوق بگیری",
               "P4S02": "قسمت چهارم: بخش ۲ درآمد پولی اعضای شاغل خانوار از مشاغل غیر مزد و حقوق بگیری (آزاد)",
               "P4S03": "قسمت چهارم: بخش ۳ درآمدهای متفرقه خانوار در ۱۲ ماه گذشته",
               "P4S04": "قسمت چهارم: ستون ۹ بخش ۳ شامل وام و یارانه"
               }

#### Reading the files

Looping through the data folder to get all the file names

In [4]:
resource_directory = "data"
resource_files : list[str] = []
for f in os.listdir(resource_directory):
    f_path = os.path.join(resource_directory, f)
    if os.path.isfile(f_path):
        resource_files.append(f)
del f, f_path

filename = "U99.xlsx"

file_path = os.path.join(resource_directory, filename)
year_file = pd.ExcelFile(file_path)

if filename.startswith("U"):
    Urban = True
else:
    Urban = False

year = int(filename.strip(".xlsx")[1:])
if (year==99) or (year ==98):
    year = 1300+year



In [5]:
sheet_index = 0
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df["Year"] = year
current_df["Urban"] = Urban
current_df = current_df.loc[current_df.TakmilDescA.isna() | 
                        current_df.TakmilDescB.isna() | 
                        current_df.JaygozinDescA.isna() | 
                        current_df.JaygozinDescB.isna() |
                        current_df.Jaygozin!=2, :]
if "JaygozinDescC" in current_df.columns:
    current_df = current_df.loc[current_df.JaygozinDescC.isna(), :]

# We only consider addresses from this dataframe and do left join with other dfs
year_df = current_df.loc[:, ["Address", "Urban", "Year", "Fasl", "weight", "province", "town"]].copy()

In [6]:
sheet_index = 9
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
transportation_cost_df = (current_df.groupby("Address").agg({"value": "sum"})
                    .rename(columns={"value": "transportation_cost"})
                    .reset_index())
year_df = pd.merge(year_df, transportation_cost_df, how="left", on="Address")
year_df.dropna(subset=["transportation_cost"], inplace=True)

In [7]:
sheet_index=1
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.", 
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.fillna({"studying": "No"}, inplace=True)
# Counting family members of each family
family_member_count = (current_df.groupby("Address")
                    .agg({"member": "count"})
                    .rename(columns={"member": "member_cnt"}).reset_index())
family_mean_age = (current_df.groupby("Address").agg({"age": "mean"})
                    .rename(columns={"age": "mean_age"})
                    .reset_index())




print("investigating relationship of one-hot encoded categorical columns with target variable (i.e transportation_cost)")
for col in current_df.select_dtypes((object)):
    cat_target_relation(current_df, col, year_df, "count")

# Not much correlation between one-hot encoded columns of categorical columns and target

Sheet name from sheets_desc dict: P1. Sheet name from file: R99P1
 Loaded sheet description: قسمت یکم: خصوصیات اجتماعی اعضای خانوار
investigating relationship of one-hot encoded categorical columns with target variable (i.e transportation_cost)
Child                0.049665
OtherRelative        0.045153
Head                 0.044494
Spouse               0.041862
Parent               0.041229
SonDaughter_inLaw    0.035111
GrandSonDaughter     0.033950
NonRelative          0.032145
Sibling              0.031958
Name: MI Scores, dtype: float64
Female    0.048906
Male      0.040460
Name: MI Scores, dtype: float64
illiterate    0.048906
literate      0.040460
Name: MI Scores, dtype: float64
No     0.048906
Yes    0.040460
Name: MI Scores, dtype: float64
Bachelor      0.049665
HighSchool    0.045153
Diploma       0.044494
Secondary     0.041862
Master        0.041229
PhD           0.035111
College       0.033950
Elemantry     0.032145
Other         0.031958
Name: MI Scores, dtype: float64
Ot

In [10]:
sheet_index=2
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.", 
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
for col in current_df.select_dtypes((bool)):
    family_bool_col_target_relation(current_df, col, year_df)
for col in current_df.select_dtypes((object)):
    cat_target_relation(current_df, col, year_df, "count")

# Best feature here is only maybe vehicle


Sheet name from sheets_desc dict: P2. Sheet name from file: R99P2
 Loaded sheet description: قسمت دوم: مشخصات محل سکونت
vehicle    0.244805
Name: MI Scores, dtype: float64
motorcycle    0.013095
Name: MI Scores, dtype: float64
bicycle    0
Name: MI Scores, dtype: int64
radio    0
Name: MI Scores, dtype: int64
radiotape    0.001817
Name: MI Scores, dtype: float64
TVbw    0
Name: MI Scores, dtype: int64
TV    0
Name: MI Scores, dtype: int64
VHS_VCD_DVD    0.002328
Name: MI Scores, dtype: float64
computer    0.033481
Name: MI Scores, dtype: float64
cellphone    0
Name: MI Scores, dtype: int64
freezer    0.004808
Name: MI Scores, dtype: float64
refridgerator    0.001827
Name: MI Scores, dtype: float64
fridge    0.004927
Name: MI Scores, dtype: float64
stove    0
Name: MI Scores, dtype: int64
vacuum    0.02049
Name: MI Scores, dtype: float64
washingmachine    0.033604
Name: MI Scores, dtype: float64
sewingmachine    0.010579
Name: MI Scores, dtype: float64
fan    0.003914
Name: MI Scores, d

In [11]:
sheet_index=3
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

Sheet name from sheets_desc dict: P3S01. Sheet name from file: U99P3S01
 Loaded sheet description: قسمت سوم: بخش ۱ هزینه‌های خوراکی خانوار در ماه گذشته


value    0.112302
Name: MI Scores, dtype: float64

In [12]:
sheet_index=4
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S02. Sheet name from file: U99P3S02
 Loaded sheet description: قسمت سوم: بخش ۲ هزینه‌های نوشیدنی‌های طبقه‌بندی نشده و دخانی های خانوار در ماه گذشته


value    0.001071
Name: MI Scores, dtype: float64

In [13]:
sheet_index=5
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S03. Sheet name from file: U99P3S03
 Loaded sheet description: قسمت سوم: بخش ۳ هزینه‌های پوشاک و کفش خانوار در ماه گذشته


value    0.036746
Name: MI Scores, dtype: float64

In [14]:
sheet_index=6
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S04. Sheet name from file: U99P3S04
 Loaded sheet description: قسمت سوم: بخش ۴ هزینه‌های بخش مسکن، آب، فاضلاب، سوخت و روشنایی خانوار در ماه گذشته


value    0.062095
Name: MI Scores, dtype: float64

In [15]:
sheet_index=7
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S05. Sheet name from file: U99P3S05
 Loaded sheet description: قسمت سوم: بخش ۵ هزینه‌های مبلمان و لوازم خانگی


value    0.060912
Name: MI Scores, dtype: float64

In [16]:
sheet_index=8
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S06. Sheet name from file: U99P3S06
 Loaded sheet description: قسمت سوم: بخش ۶ هزینه‌های بهداشتی و درمانی خانوار در ماه گذشته


value    0.02906
Name: MI Scores, dtype: float64

In [17]:
sheet_index=11
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# No good feature here

Sheet name from sheets_desc dict: P3S09. Sheet name from file: U99P3S09
 Loaded sheet description: قسمت سوم: بخش ۹ هزینه‌های خدمات فرهنگی و تفریحات خانوار در ماه گذشته


value    0.010085
Name: MI Scores, dtype: float64

In [18]:
sheet_index=13
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])
# No good feature here

Sheet name from sheets_desc dict: P3S11. Sheet name from file: U99P3S11
 Loaded sheet description: قسمت سوم: بخش ۱۱ هزینه‌های غذاهای آماده، هتل و رستوران خانوار در ماه گذشته


value    0.037606
Name: MI Scores, dtype: float64

In [19]:
sheet_index=14
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])
# No good feature here

Sheet name from sheets_desc dict: P3S12. Sheet name from file: U99P3S12
 Loaded sheet description: قسمت سوم: بخش ۱۲ هزینه‌های کالاها و خدمات متفرقه خانوار در ماه گذشته


value    0.094682
Name: MI Scores, dtype: float64

In [20]:
sheet_index=15
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])


Sheet name from sheets_desc dict: P3S13. Sheet name from file: U99P3S13
 Loaded sheet description: قسمت سوم: بخش ۱۳ سایر هزینه‌ها و انتقالات در ۱۲ ماه گذشته


value    0.155787
Name: MI Scores, dtype: float64

In [21]:
sheet_index=16
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
make_mi_scores(
    pd.merge(left=year_df, 
             right=current_df.groupby("Address").agg({"value": "sum"}).reset_index(), 
             on="Address", how="left").fillna(0).loc[:, ["value"]]
    , year_df["transportation_cost"])

# no good feature here

Sheet name from sheets_desc dict: P3S14. Sheet name from file: U99P3S14
 Loaded sheet description: قسمت سوم: بخش ۱۴ سرمایه‌گذاری خانوار در ۱۲ ماه گذشته


value    0.005844
Name: MI Scores, dtype: float64

In [47]:
sheet_index=17
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)

col = "employed_w"
temp_df = current_df.copy()
temp_df[col] = temp_df[col].replace({2:0})
grouped = (temp_df.groupby("Address").agg({col: "sum"})
                        .reset_index())

df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address")
df.fillna({col:0}, inplace=True)

print(make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost))

col = "netincome_w_m"
temp_df = current_df.copy()
grouped = (temp_df.groupby("Address").agg({col: "sum"})
                        .reset_index())

df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address")
df.fillna({col:0}, inplace=True)

print(make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost))

# no good features here

Sheet name from sheets_desc dict: P4S01. Sheet name from file: U99P4S01
 Loaded sheet description: قسمت چهارم: بخش ۱ درآمد پولی اعضای شاغل خانوار از مشاغل مزد و حقوق بگیری
employed_w    0.004972
Name: MI Scores, dtype: float64
netincome_w_m    0.062906
Name: MI Scores, dtype: float64


In [60]:
sheet_index=18
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)

col = "income_s_y"
temp_df = current_df.copy()
grouped = (temp_df.groupby("Address").agg({col: "sum"})
                        .reset_index())

df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address")
df.fillna({col:0}, inplace=True)

print(make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost))
# no good features here, all columns were investigated

Sheet name from sheets_desc dict: P4S02. Sheet name from file: U99P4S02
 Loaded sheet description: قسمت چهارم: بخش ۲ درآمد پولی اعضای شاغل خانوار از مشاغل غیر مزد و حقوق بگیری (آزاد)
income_s_y    0.009299
Name: MI Scores, dtype: float64


In [8]:
sheet_index=19
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
col = "income_aid"
temp_df = current_df.copy()
temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce").fillna(0)
grouped = (temp_df.groupby("Address").agg({col: "sum"})
                        .reset_index())

df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address")
df.fillna({col:0}, inplace=True)

print(make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost))
# no good features here, all columns were investigated

Sheet name from sheets_desc dict: P4S03. Sheet name from file: U99P4S03
 Loaded sheet description: قسمت چهارم: بخش ۳ درآمدهای متفرقه خانوار در ۱۲ ماه گذشته
income_aid    0.033119
Name: MI Scores, dtype: float64


In [69]:

sheet_index=20
sheet_name = list(sheets_desc.keys())[sheet_index]
print(f"Sheet name from sheets_desc dict: {sheet_name}.",
    f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
    f"Loaded sheet description: {sheets_desc[sheet_name]}")
current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
current_df = empty_string_remover(current_df)
col = "subsidy"
temp_df = current_df.copy()
temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce").fillna(0)
grouped = (temp_df.groupby("Address").agg({col: "sum"})
                        .reset_index())

df = pd.merge(left=year_df[["Address", "transportation_cost"]], right=grouped, how="left", on="Address")
df.fillna({col:0}, inplace=True)

print(make_mi_scores(df.drop(["Address", "transportation_cost"], axis=1), year_df.transportation_cost))
# no good features here

Sheet name from sheets_desc dict: P4S04. Sheet name from file: U99P4S04
 Loaded sheet description: قسمت چهارم: ستون ۹ بخش ۳ شامل وام و یارانه
subsidy    0.052957
Name: MI Scores, dtype: float64
