In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler,OrdinalEncoder,OneHotEncoder


scaler_minmax= MinMaxScaler()                                           # Create a MinMaxScaler object
scaler_standered=StandardScaler()                                       # Create a StandardScaler object
oe=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) # Create a OrdenalEncoder object
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')       # Create a OneHotEncoder object

# Read CSV files

df=pd.read_csv("Training_Set_Values.csv")    # Read the CSV file
name_featrures=df.columns                    # Get the features name
len_features=len(name_featrures)                # Get the length of features
labels=pd.read_csv("Training_Set_Labels.csv") # Read the labels CSV file
labels.head()
df['target'] = labels['status_group']        # Add the target column to the dataframe
print(df.shape)                              # Print the shape of the dataframe
#df.head()
#df.info()


In [None]:
# Columns with Nan values
nan_columns = df.columns[df.isnull().any()].tolist()           # Get the columns with NaN values
print("Columns with NaN values: ", nan_columns)                # Print the columns names with NaN values
print("Number of columns with NaN values: ", len(nan_columns)) # Print the number of columns with NaN values
#df[nan_columns].head()                                         # Print the first 5 rows of the columns with NaN values

In [None]:
# Exploration of NaN Columns
for col in nan_columns:                                             # Loop through the columns with NaN values
    print("--------------------------------------------------")     # Print a separator line
    print("Name:",col)                                              # Print the column name
    print("Number of NaN:",df[col].isnull().sum())                  # Print the number of NaN values in the column
    print("Percentage of NaN:", df[col].isnull().sum()/len(df)*100) # Print the percentage of NaN values in the column
    print(df[col].value_counts())                                   # Print the value counts of the column


In [None]:
# Column 01 amount_tsh (Static Head)  ## 50 percent values are zero....I think Drop this column
print('Nan count in amount_tsh',df["amount_tsh"].isnull().sum()) # Number of NaN values in the column

print(df["amount_tsh"].describe())
print("median: ", df["amount_tsh"].median()) # Median is zero...means half of the points are zero (since points are positive)
plt.figure(figsize=(8,5))
sns.histplot(df["amount_tsh"], bins=200, kde=True)
plt.xlabel("Amount TSH (Static Head)")
plt.ylabel("Frequency")
plt.title("Histogram of Static Head")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(df["amount_tsh"])
sns.stripplot(df["amount_tsh"],color="red",alpha=0.5)
plt.ylim(0, 100) 
plt.show()




In [None]:

# Column 02 date_recorded (Date when the recoed entered)
print("date_recorded")
print("NaN value count:",df["date_recorded"].isnull().sum()) # Check if there are any null values in the column
df['date_recorded'] = pd.to_datetime(df['date_recorded']).dt.year# Convert to datetime format and extract year
print(df['date_recorded'].value_counts()) # Print the value counts of the column
df = df[~df['date_recorded'].isin([2002, 2004])] # Remove the record for years 2002 and 2004 from the dataframe.
                                             #31 records will be droped

df['date_recorded']=oe.fit_transform(df['date_recorded'].values.reshape(-1, 1)) # Fit and transform the column using OrdinalEncoder
print('After Transformatiom',df['date_recorded'].value_counts()) # Print the value counts of the column



In [None]:
# Column 03 funder # Not Clear how to handle this column
print("NaN value count in funder:",df["funder"].isnull().sum()) # Check if there are any null values in the column
print('Unique values:',df['funder'].nunique()) # Print the number of unique values in the column
print(df['funder'].describe()) # Print the value counts of the column


In [None]:
# Column 04 gps_height (GPS height) # The column needs to be considered as data is ok, minmax normalization is used
print("--------------------------------")
print("gps_height")
print("NaN value count:",df["gps_height"].isnull().sum())                # Check if there are any null values in the column
print("Percentage of NaN:", df["gps_height"].isnull().sum()/len(df)*100) # Print the percentage of NaN values in the column
print(df["gps_height"].describe())                                       # Print the description of the column
print("Median:", df["gps_height"].median())                              # Print the median of the column
plt.figure(figsize=(8,5))
sns.histplot(df["gps_height"], bins=200, kde=True)                       # Plot the histogram of the column
plt.xlabel("GPS Height")
plt.ylabel("Frequency")
plt.title("Histogram of GPS Height")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(y=df["gps_height"])
sns.stripplot(y=df["gps_height"], color="red", alpha=0.5)                # Adds all points
plt.title("Boxplot of GPS Height Before Normalization")
plt.show() # Show the plot



df["gps_height"]=scaler_minmax.fit_transform(df["gps_height"].values.reshape(-1,1)) # Fit and transform the data using MinMaxScaler
df["gps_height"] = np.clip(df["gps_height"], 0, 1) # To ensure that the values are between 0 and 1, in case the testdata has values outside the range of training data
plt.figure(figsize=(8,5))
sns.boxplot(df["gps_height"])
sns.stripplot(df["gps_height"], color="red", alpha=0.5)  # Adds all points
plt.ylabel("GPS Height (MinMax Normalized)")
plt.title("Boxplot of GPS Height After Normalization")
plt.show() # Show the plot



In [None]:
# Column 05 installer # Not Clear how to handle this column
print("NaN value count in installer:",df["installer"].isnull().sum()) # Check if there are any null values in the column
print('Unique values:',df['installer'].nunique()) # Print the number of unique values in the column
print(df['installer'].describe()) # Print the value counts of the column
#print(df['installer'].value_counts()) # Print the value counts of the column
pd.crosstab(df['installer'],df['target']).head() # Cross tabulation of installer and status_group


In [None]:
#Columns 06 longitude (GPS Coordinates) # The column needs to be considered as data is ok,outlier replaced with medain and minmax normalization
print("--------------------------------")
print("longitude")
print("NaN value count:",df["longitude"].isnull().sum())                # Check if there are any null values in the column
print(df["longitude"].describe())                                                  # Print the description of the column
print("Median:", df["longitude"].median())                              # Print the median of the column

plt.figure(figsize=(8,5))
sns.histplot(df["longitude"], bins=200, kde=True)               # Plot the histogram of the column
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(df["longitude"])
sns.stripplot(df["longitude"], color="red", alpha=0.5)                  # Adds all points
plt.title("Boxplot of Longitude Before Outlier Removal")
plt.show() # Show the plot

# Remove the outliers from the longitude column
Q1=df["longitude"].quantile(0.25) # 25th percentile
Q3=df["longitude"].quantile(0.75) # 75th percentile
IQR=Q3-Q1 # Interquartile range
Lower_bound_longitude=Q1-1.5*IQR # Lower bound
Upper_bound_longitude=Q3+1.5*IQR # Upper bound
median_longitude=df["longitude"].median() # Median value

df["longitude_outlier_replaced_median"]=df["longitude"].apply(lambda x:x if ((x>=Lower_bound_longitude) &(x<=Upper_bound_longitude)) else median_longitude) # Replace outliers with median value


plt.figure(figsize=(8,5))
sns.boxplot(df["longitude"])
sns.stripplot(df["longitude_outlier_replaced_median"], color="red", alpha=0.5)                  # Adds all points
plt.title("Boxplot of Longitude After Outlier Removal With Median")
plt.show() # Show the plot


df["longitude_outlier_replaced_median_minmax_normalized"]=scaler_minmax.fit_transform(df["longitude_outlier_replaced_median"].values.reshape(-1,1)) # Fit and transform the data using MinMaxScaler
df["longitude_outlier_replaced_median_minmax_normalized"] = np.clip(df["longitude_outlier_replaced_median_minmax_normalized"], 0, 1) # To ensure that the values are between 0 and 1, in case the testdata has values outside the range of training data


plt.figure(figsize=(8,5))
sns.boxplot(df["longitude_outlier_replaced_median_minmax_normalized"])
sns.stripplot(df["longitude_outlier_replaced_median_minmax_normalized"], color="red", alpha=0.5)  # Adds all points
plt.ylabel("longitude (Outlier Replaced by Median and MinMax Normalized)")
plt.title("Boxplot of GPS longitude After Outlier Removal and Normalization")
plt.show() # Show the plot


df["longitude"] = df["longitude_outlier_replaced_median_minmax_normalized"] # Replace original longitude with processed values

df.drop("longitude_outlier_replaced_median", axis=1, inplace=True) # Drop the intermediate column
df.drop("longitude_outlier_replaced_median_minmax_normalized", axis=1, inplace=True) # Drop the intermediate column


df.columns




In [None]:
#Columns 07 latitude (GPS Coordinates) # The column needs to be considered as data is ok
print("--------------------------------")
print("latitude")
print("NaN value count:",df["latitude"].isnull().sum())                # Check if there are any null values in the column
print(df["latitude"].describe())                                                  # Print the description of the column
print("Median:", df["latitude"].median())                              # Print the median of the column

plt.figure(figsize=(8,5))
sns.histplot(df["latitude"], bins=200, kde=True)               # Plot the histogram of the column
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(df["latitude"])
sns.stripplot(df["latitude"], color="red", alpha=0.5)                  # Adds all points
plt.title("Boxplot of latitude")
plt.show() # Show the plot


df["latitude_minmax_normalized"]=scaler_minmax.fit_transform(df["latitude"].values.reshape(-1,1)) # Fit and transform the data using MinMaxScaler
df["latitude_minmax_normalized"] = np.clip(df["latitude_minmax_normalized"], 0, 1) # To ensure that the values are between 0 and 1, in case the testdata has values outside the range of training data


plt.figure(figsize=(8,5))
sns.boxplot(df["latitude_minmax_normalized"])
sns.stripplot(df["latitude_minmax_normalized"], color="red", alpha=0.5)  # Adds all points
plt.ylabel("Latitude MinMax Normalized)")
plt.title("Boxplot of Latitude After Normalization")
plt.show() # Show the plot

df["latitude"] = df["latitude_minmax_normalized"] # Replace original longitude with processed values
df.drop("latitude_minmax_normalized", axis=1, inplace=True) # Drop the intermediate column
df.columns



In [None]:
# Column 08 wpt_name (Waterpoint Name)  # Not Clear how to handle this column
print("NaN value count in wpt_name:",df["wpt_name"].isnull().sum()) # Check if there are any null values in the column
df['wpt_name'].value_counts() # Print the value counts of the column


In [None]:
# Column 09 num_private # Since 75% values are zero better to drop this column
print("--------------------------------")
print(df["num_private"].describe()) # Print the description of the column
df["num_private"].isnull().sum() # Check if there are any null values in the column
plt.figure(figsize=(8,5))
sns.histplot(df["num_private"], bins=200,kde=True) # Plot the histogram of the column
plt.title("Histogram of num_private")
plt.xlabel("num_private")
plt.ylabel("Frequency")
plt.show()
plt.figure(figsize=(8,5))
plt.scatter(df.index, df['num_private'], color='blue', label='Values')
plt.xlabel('Index')
plt.ylabel('Values')
plt.title("num_private vs Index")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(df["num_private"])
#sns.stripplot(df["num_private"], color="red", alpha=0.5)                  # Adds all points
plt.title("Boxplot of num_private")
# Labels and title
plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Values vs Index')
plt.show()

In [None]:
# Column 10 basin # The column needs to be considered as data is ok..I used One Hot Encoding for this
print("NaN value count:",df["basin"].isnull().sum())                # Check if there are any null values in the column
df["basin"].describe() # Print the value counts of the column
print(df.basin.value_counts()) # Print the value counts of the column
print(df["basin"].head())
encoded_basin = ohe.fit_transform(df[['basin']])
encoded_basin_df = pd.DataFrame(encoded_basin, columns=ohe.get_feature_names_out(['basin']))
print(encoded_basin_df.head(5)) # Print the first 5 rows of the encoded dataframe
df = pd.concat([df, encoded_basin_df], axis=1) # Concatenate the original dataframe with the encoded dataframe
df.drop(columns=['basin'], inplace=True) # Drop the original column
df.head()





In [1041]:
# Column 11 subvillage # Since region code and district code gives same info 
                    # so ignore this because it has NaN and string to number 
                    # conversion is needed 
print("NaN count:",df["subvillage"].isnull().sum()) # Check if there are any null values in the column
print(df["subvillage"].describe()) # Print the value counts of the column
df["subvillage"].value_counts() # Print the value counts of the column
df["subvillage"].head()

NaN count: 402
count        58998
unique       19281
top       Madukani
freq           508
Name: subvillage, dtype: object


0      Mnyusi B
1       Nyamara
2       Majengo
3    Mahakamani
4    Kyanyamisa
Name: subvillage, dtype: object

In [1062]:
df[pd.isna(df["region"])][['subvillage','region','region_code','district_code','lga','ward']]# Print the first 10 rows of the dataframe where region is Kigoma

Unnamed: 0,subvillage,region,region_code,district_code,lga,ward
762,,,,,,
1189,,,,,,
2601,,,,,,
3446,,,,,,
8729,,,,,,
8736,,,,,,
10441,,,,,,
13366,,,,,,
14697,,,,,,
15103,,,,,,


In [None]:
# Column 12 region # Since region code and district code gives same info 
                    # so ignore this because string to number 
                    # conversion is needed
print("NaN count:", df["region"].isnull().sum())           # Check if there are any null values in the column
print(df["region"].describe())                             # Print the description of the column
df["region"].value_counts()                                # Print the value counts of the column
freq_encoding = df['region'].value_counts(normalize=True)  # Frequency Encoding for region column
df['region'] = df['region'].map(freq_encoding)             # Map the frequencies to the original column

print(df['region'].head())

NaN count: 31
count      59369
unique        21
top       Iringa
freq        5293
Name: region, dtype: object
0    0.089154
1    0.033132
2    0.026664
3    0.029072
4    0.055787
Name: region, dtype: float64


In [1056]:
#Column region_code # Since it has 27 unique codes i chose to use normalized frequncy encoding.
                    # Since frequncy of all codes are not balanced therefore frequency encoding
                    # may result in a biased model..I think solution is to use the distric_code column as well
print("NaN count",df["region_code"].isnull().sum()) # Check if there are any null values in the column
print("Unique values:",df["region_code"].nunique()) # Print the distinct values of the column
print(df["region_code"].describe()) # Print the value counts of the column
print("code vs freq",df["region_code"].value_counts()) # Print the value counts of the column
df["region_code"].value_counts() # Print the first 5 rows of the column
freq_encoding=df["region_code"].value_counts(normalize=True) # Frequency Encoding for region_code column
df["region_code"]=df["region_code"].map(freq_encoding) # Map the frequencies to the original column
df['region_code'].head() # Print the first 5 rows of the column

NaN count 31
Unique values: 27
count    59369.000000
mean        15.290977
std         17.578962
min          1.000000
25%          5.000000
50%         12.000000
75%         17.000000
max         99.000000
Name: region_code, dtype: float64
code vs freq region_code
11.0    5299
17.0    5007
12.0    4637
3.0     4379
5.0     4039
18.0    3320
19.0    3042
2.0     3024
16.0    2816
10.0    2640
4.0     2509
1.0     2201
13.0    2093
14.0    1979
20.0    1967
15.0    1807
6.0     1608
21.0    1583
80.0    1238
60.0    1023
90.0     913
7.0      805
99.0     423
9.0      390
24.0     326
8.0      300
40.0       1
Name: count, dtype: int64


0    0.089255
1    0.033132
2    0.026664
3    0.015378
4    0.055921
Name: region_code, dtype: float64

In [1057]:
# Column district_code # Since has 20 unique codes i chose to use normalized frequncy encoding.
                    # Since frequncy of all codes are not balanced therefore frequency encoding
                    # may result in a biased model..I think solution is to use the region_code column as well
print("NaN count",df["district_code"].isnull().sum()) # Check if there are any null values in the column
print("Unique values:",df["district_code"].nunique()) # Print the distinct values of the column

print(df["district_code"].describe()) # Print the value counts of the column
print("code vs freq",df["district_code"].value_counts()) # Print the value counts of the column
df["district_code"].head() # Print the first 5 rows of the column




NaN count 31
Unique values: 20
count    59369.000000
mean         5.627196
std          9.631542
min          0.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         80.000000
Name: district_code, dtype: float64
code vs freq district_code
1.0     12199
2.0     11169
3.0      9995
4.0      8994
5.0      4355
6.0      4074
7.0      3339
8.0      1040
30.0      994
33.0      868
53.0      745
43.0      505
13.0      391
23.0      293
63.0      195
62.0      109
60.0       63
0.0        23
80.0       12
67.0        6
Name: count, dtype: int64


0     5.0
1     2.0
2     4.0
3    63.0
4     1.0
Name: district_code, dtype: float64

In [1058]:
# Column lga # No NaN, 125 unique values, used normalized frequency encoding
print("NaN count:",df["lga"].isnull().sum())# Print the value counts of the column
print("Unique values:",df["lga"].nunique())# Print the value counts of the column
print(df["lga"].describe()) # Print the description of the column
print(df["lga"].head()) # Print the description of the column
freq_encoding=df["lga"].value_counts(normalize=True) # Frequency Encoding for lga column
df["lga"]=df["lga"].map(freq_encoding) # Map the frequencies to the original column
df["lga"].head()


NaN count: 31
Unique values: 125
count      59369
unique       125
top       Njombe
freq        2502
Name: lga, dtype: object
0       Ludewa
1    Serengeti
2    Simanjiro
3     Nanyumbu
4      Karagwe
Name: lga, dtype: object


0    0.009500
1    0.012026
2    0.005188
3    0.002661
4    0.012987
Name: lga, dtype: float64

In [1059]:
# Column ward # No NaN, 2092 unique values, used normalized frequency encoding
print("NaN count:",df["ward"].isnull().sum())# Print the value counts of the column
print("Unique values:",df["ward"].nunique())# Print the value counts of the column
print(df["ward"].describe()) # Print the description of the column
#print(df["ward"].head()) # Print the description of the column
freq_encoding=df["ward"].value_counts(normalize=True) # Frequency Encoding for ward column
df["ward_freq"]=df["ward"].map(freq_encoding) # Map the frequencies to the original column
df["ward"].head()


NaN count: 31
Unique values: 2092
count     59369
unique     2092
top       Igosi
freq        307
Name: ward, dtype: object


0      Mundindi
1         Natta
2       Ngorika
3      Nanyumbu
4    Nyakasimbi
Name: ward, dtype: object

In [None]:
# Column population
print('NaN count:',df['population'].isnull().sum()) # Check if there are any null values in the column
#print('Unique values',df['population'].value_counts()) # Print the value counts of the column
print('Median',df['population'].median()) # Print the median of the column
print(df['population'].describe())
plt.figure(figsize=(8,5))
sns.histplot(df["population"], bins=200, kde=True) # Plot the histogram of the column
plt.figure(figsize=(8,5))
sns.boxplot(df["population"])
sns.stripplot(df["population"], color="red", alpha=0.5)                  # Adds all points
plt.ylim(-100,3000)
plt.title("Boxplot of Population Before Outlier Removal")
plt.show() # Show the plot
plt.figure(figsize=(8,5))
plt.scatter(df.index, df['population'], color='blue', label='Values')
plt.show() # Show the plot

# Remove the outliers from the population column
Q1=df["population"].quantile(0.25) # 25th percentile
Q3=df["population"].quantile(0.75) # 75th percentile
IQR=Q3-Q1 # Interquartile range
Lower_bound_population=Q1-1.5*IQR # Lower bound
Upper_bound_population=Q3+1.5*IQR # Upper bound

df["population_outlier_replaced_NaN"]=df["population"].apply(lambda x:x if ((x>=Lower_bound_population) & (x<=Upper_bound_population)) else np.nan) # Replace outliers with median value

print("LB:",Lower_bound_population)
print("UB:",Upper_bound_population)


plt.figure(figsize=(8,5))
sns.boxplot(y=df["population"])
sns.stripplot(y=df["population_outlier_replaced_NaN"], color="red", alpha=0.5)
plt.ylim(-100,3000)
plt.title("Boxplot of Population After Outlier Removal With NaN")
plt.show() # Show the plot

print('Nan count in population_outlier_replaced_NaN:',df['population_outlier_replaced_NaN'].isnull().sum()) # Check if there are any null values in the column
df['population_outlier_replaced_mean'] = df.groupby('ward')['population_outlier_replaced_NaN'].transform(
    lambda x: x.fillna(x.mean())
)
print("NaN count after imputaion: ",df['population_outlier_replaced_mean'].isnull().sum()) # Check if there are any null values in the column






In [None]:
#Column public_meeting # 3334 NaN values replaced with string Unknown, One Hot Encoding used
print(df['public_meeting'].value_counts())                   # Print the value counts of the column
df['public_meeting'].isnull().sum()                          # Check if there are any null values in the column)
df['public_meeting']=df['public_meeting'].fillna('Unknown')  # Fill NaN values with 'Unknown'
print(df['public_meeting'].value_counts())                   # Print the value counts of the column
df = pd.get_dummies(df, columns=["public_meeting"], prefix="public_meeting", dtype=int)# Convert categorical variable into dummy/indicator variables
df.drop(columns=["public_meeting_Unknown"], inplace=True)    # Drop the first column to avoid dummy variable trap
#print(df[['public_meeting_True','public_meeting_False']].head(5))

In [None]:
# Column recorded_by # All values are "GeoData Consultants Ltd" ignore this column
print('Nan count:',df['recorded_by'].isnull().sum()) # Check if there are any null values in the column
df['recorded_by'].describe() # Print the value counts of the column
print('Unique values:',df['recorded_by'].nunique()) # Print the value counts of the column
print(df['recorded_by'].describe())
df.columns


In [None]:
# Column scheme_management # could not decide about this column..drop/keep
print('Nan count',df['scheme_management'].isnull().sum())
#print(df['scheme_name'].nunique())
print(df['scheme_management'].value_counts())
#pd.crosstab(df['scheme_name'],df['scheme_management'])
pd.set_option("display.max_rows", None)
print(df[df['scheme_management'].isna()][['scheme_management','scheme_name']]) 

In [None]:
#Column scheme_name # Nan count is 28810..almost 50 percemt so ignore the column
print('Nan count:',df['scheme_name'].isnull().sum()) # Check if there are any null values in the column
df['scheme_name'].describe() # Print the value counts of the column

In [None]:
# Column permit # NaN replaced with unknown and 1 hot encdoing is used
print('Nan count:',df['permit'].isnull().sum()) # Check if there are any null values in the column
print(df['permit'].describe())
print('value count:',df['permit'].value_counts())
df['permit']=df['permit'].fillna('Unknown')
print('Nan count:',df['permit'].isnull().sum()) # Check if there are any null values in the column
print('value count:',df['permit'].value_counts())
df=pd.get_dummies(df,columns=['permit'],prefix='permit',dtype=int)
df.drop(columns=['permit_Unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
df[['permit_False', 'permit_True']].head()

In [None]:
#Column construction_year # 1/3 values are zero..dont know yet how to deal with it
print('Nan count:',df['construction_year'].isnull().sum()) # Check if there are any null values in the column
print(df['construction_year'].value_counts()) # Print the value counts of the column
print('unique values:',df['construction_year'].nunique()) # Print the value counts of the column

In [None]:
# Columns [extraction_type, extraction_type_group, extraction_type_class]
# mismatch b/w extraction type and group 2469
# outof 2469, the count for mismatch rows between group and class is: 486
# Based on anlysis I decided to drop two columns [extraction_type, extraction_type_class]
# use column ['extraction_type_group']
# Since 'extraction_type_group' has 0 NaN values and 13 unique values, use one hot encoding

print('mismatch b/w extraction type and group',(df['extraction_type'] != df['extraction_type_group']).sum()) # Check if there are any mismatch values in the type and group columns
print('mismatch b/w extraction type and class',(df['extraction_type'] != df['extraction_type_class']).sum()) # Check if there are any mismatch values in the type and class column
print('mismatch b/w extraction group and class',(df['extraction_type_group'] != df['extraction_type_class']).sum()) # Check if there are any mismatch values in the group and class column
print('--------------------------------------------------------')
a=df[df['extraction_type'] != df['extraction_type_group']][['extraction_type_group','extraction_type_class']]
print("Out of 2469, the count for mismatch rows between group and class is:", (a['extraction_type_group'] != a['extraction_type_class']).sum())
print('--------------------------------------------------------')
print('Nan count',df['extraction_type_group'].isnull().sum())                                               # Check if there are any null values in the column
print(df['extraction_type_group'].value_counts())                                                           # Print the value counts of the column
print('Unique values:',df['extraction_type_group'].nunique())                                               # Print the value counts of the column
print('--------------------------------------------------------')
df=pd.get_dummies(df,columns=['extraction_type_group'],prefix='extraction_type_group',dtype=int)            # Convert categorical variable into dummy/indicator variables
df.drop(columns=['extraction_type_group_afridev'],inplace=True)                                             # Drop the first column to avoid dummy variable trap
"""
df[['extraction_type_group_gravity', 'extraction_type_group_india mark ii',
       'extraction_type_group_india mark iii', 'extraction_type_group_mono',
       'extraction_type_group_nira/tanira', 'extraction_type_group_other',
       'extraction_type_group_other handpump',
       'extraction_type_group_other motorpump',
       'extraction_type_group_rope pump', 'extraction_type_group_submersible',
       'extraction_type_group_swn 80', 'extraction_type_group_wind-powered']].head()
"""

In [None]:
# Column ['management', 'management_group'] # No NaN Values.
# 12 unique values in management
# 5 unique values in management_group
# Used one hot encoding for both columns
df[['management', 'management_group']].head()
df['management'].nunique()                                                                      # Print the number of unique values in the column
pd.crosstab(df['management'],df['management_group'])                                            # Print the cross tabulation of management and management_group columns
pd.crosstab(df['management'],df['target'])                                                      # Print the cross tabulation of management and target columns
pd.crosstab(df['management_group'],df['target'])                                                # Print the cross tabulation of management_group and target columns
print('--------------------------------------------------------')
print('Nan count in management column',df['management'].isnull().sum())                         # Check if there are any null values in the column
print('value_counts in management column',df['management'].value_counts())                      # Print the value counts of the column
print('Unique values in management column',df['management'].nunique())                          # Print the number of unique values in the column
print('--------------------------------------------------------')
print('Nan count in management_group column',df['management_group'].isnull().sum())             # Check if there are any null values in the column
print('value_counts in management_group column',df['management_group'].value_counts())          # Print the value counts of the column
print('Unique values in management_group column',df['management_group'].nunique())              # Print the number of unique values in the column
print('--------------------------------------------------------')
df=pd.get_dummies(df,columns=['management'],prefix='management',dtype=int)                      # Convert categorical variable into dummy/indicator variables
df=pd.get_dummies(df,columns=['management_group'],prefix='management_group',dtype=int)          # Convert categorical variable into dummy/indicator variables

df.drop(columns=['management_unknown'],inplace=True)                                           # Drop the first column to avoid dummy variable trap
df.drop(columns=['management_group_unknown'],inplace=True)                                     # Drop the first column to avoid dummy variable trap

"""
df[[   'management_company', 
       'management_other', 
       'management_other - school',
       'management_parastatal', 'management_private operator',
       'management_trust', 'management_vwc',
       'management_water authority', 'management_water board',
       'management_wua', 'management_wug']].head()
df[['management_group_commercial',
    'management_group_other', 
    'management_group_parastatal', 
    'management_group_user-group']].head()
"""

In [None]:

# Columns ['payment', 'payment_type']
# No Nan values and 07 Unique categories in both columns
# Semantically same values in both columns
# After mapping(staderdization) choose one column i.e payment,
# One hot encoding is used for payment column
print(df['payment'].value_counts()) #
print('--------------------------------------------------------')
print(df['payment_type'].value_counts()) # Check if there are any null values in the column

# The values in the two columns semantically are almost same just differet wording
# Use same wording in both columns to avoid confusion

standard_map = {
    'annually': 'annually',
    'pay annually': 'annually',
    'monthly': 'monthly',
    'pay monthly': 'monthly',
    'per bucket': 'per bucket',
    'pay per bucket': 'per bucket',     # Define mapping to use same wording in both columns ['payment, 'payment_type'] to avoid confusion
    'on failure': 'on failure',
    'pay when scheme fails': 'on failure',
    'never pay': 'never pay',
    'unknown': 'unknown',
    'other': 'other'
}
df['payment'] = df['payment'].map(standard_map)
df['payment_type'] = df['payment_type'].map(standard_map)
print(df['payment'].value_counts()) #
print('--------------------------------------------------------')
print(df['payment_type'].value_counts()) # Check if there are any null values in the column
print('After mapping: mismatch count:should be zero',(df['payment'] != df['payment_type']).sum()) # Check if there are any null values in the column
print('Nan count:',df['payment'].isnull().sum()) # Check if there are any null values in the column
print('Unique values', df['payment'].nunique())# Check if there are any null values in the column
df=pd.get_dummies(df,columns=['payment'],prefix='payment',dtype=int) # Convert categorical variable into dummy/indicator variables
#print(df.columns)
df.drop(columns=['payment_unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
df[['payment_annually', 'payment_monthly', 'payment_never pay',
       'payment_on failure', 'payment_other', 'payment_per bucket']].head(5) # Print the first 5 rows 

In [None]:
# Columns ['water_quality', 'quality_group']
# Based on the value_counts water_quality is the detailed version of quality_group
# No NaN values in both columns
# 8 categories in water_quality and 6 categories in quality_group
# I chose one hot encoding for water quality and decided to drop quality_group column
#print(df['water_quality'].head())
print('--------------------------------------------------------')
print(df['water_quality'].value_counts()) # Check if there are any null values in the column
print('Nan count in water quality:',df['water_quality'].isnull().sum()) # Check if there are any null values in the column
print('--------------------------------------------------------')
print(df['quality_group'].value_counts()) # Check if there are any null values in the column
print('Nan count in quality group:',df['water_quality'].isnull().sum()) # Check if there are any null values in the column
df=pd.get_dummies(df,columns=['water_quality'],prefix='wate_quality',dtype=int) # Convert categorical variable into dummy/indicator variables
df.drop(columns=['wate_quality_unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
df.columns
#df[['water_quality','quality_group']].head()

df[['wate_quality_coloured', 'wate_quality_fluoride',
       'wate_quality_fluoride abandoned', 'wate_quality_milky',
       'wate_quality_salty', 'wate_quality_salty abandoned',
       'wate_quality_soft' ]].head()

In [None]:
# Columns ['quantity', 'quantity_group'] are same
# No Nan Values in both columns
# 5 categories in both columns
# I chose to keep quantity column and drop quantity_group column
# 1 hot encoding is used for quantity column

print(df['quantity'].head()) 
print(df['quantity'].value_counts()) # Check if there are any null values in the column
print('Nan count in quantity',df['quantity'].isnull().sum()) # Check if there are any null values in the column

print('---------------------------------------------------------')
print(df['quantity_group'].value_counts()) # Check if there are any null values in the column
print('Nan count in quantity group',df['quantity_group'].isnull().sum()) # Check if there are any null values in the column
print('---------------------------------------------------------')
df=pd.get_dummies(df,columns=['quantity'],prefix='quantity',dtype=int) # Convert categorical variable into dummy/indicator variables
df.drop(columns=['quantity_unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
print(df[['quantity_dry', 'quantity_enough',
       'quantity_insufficient', 'quantity_seasonal']].head())


In [None]:
# Columns ['source', 'source_type']
# No NaN values in both columns
# 'source' is the detailed version of 'source_type'
# 10 categories in source and 7 categories in source_type
# I chose to keep source column and drop source_type column
# 1 hot encoding is used for source column
print(df['source'].head()) # Print the first 5 rows of the column
print('---------------------------------------------------------')
print(df['source'].value_counts()) # Check if there are any null values in the column
print('Nan count in source',df['source'].isnull().sum()) # Check if there are any null values in the column

print('---------------------------------------------------------')
print(df['source_type'].value_counts()) # Check if there are any null values in the column
print('Nan count in source type',df['source_type'].isnull().sum()) # Check if there are any null values in the column

print('---------------------------------------------------------')
df=pd.get_dummies(df,columns=['source'],prefix='source',dtype=int) # Convert categorical variable into dummy/indicator variables
df.drop(columns=['source_unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
df[['source_dam', 'source_hand dtw', 'source_lake',
       'source_machine dbh', 'source_other', 'source_rainwater harvesting',
       'source_river', 'source_shallow well', 'source_spring']].head(5) # Print the first 5 rows of the column

In [None]:
# Column ['source_class']
# No Nan values in the column
# 3 categories in the column
# 1 hot encoding is used for source_class column
print(df['source_class'].head()) # Print the first 5 rows of the column
print('---------------------------------------------------------')
#print(pd.crosstab(df['source_class'],df['source_type'])) # Print the cross tabulation of source and source_type columns
print(df['source_class'].value_counts()) # Check if there are any null values in the column
print('Nan count in source class',df['source_class'].isnull().sum()) # Check if there are any null values in the column
print('---------------------------------------------------------')
df=pd.get_dummies(df,columns=['source_class'],prefix='source_class',dtype=int) # Convert categorical variable into dummy/indicator variables
df.drop(columns=['source_class_unknown'],inplace=True) # Drop the first column to avoid dummy variable trap
df[['source_class_groundwater', 'source_class_surface']].head() # Print the first 5 rows of the column

In [None]:
# Columns ['waterpoint_type', 'waterpoint_type_group'] have same info
# 'waterpoint_type' is the detailed version of 'waterpoint_type_group'
# No NaN values in both columns
# 7 categories in waterpoint_type and 6 categories in waterpoint_type_group
# I chose to keep waterpoint_type column and drop waterpoint_type_group column
# 1 hot encoding is used for waterpoint_type column
print(df['waterpoint_type'].head()) 
print('---------------------------------------------------------')
print(df['waterpoint_type'].value_counts()) # Check if there are any null values in the column
print('Nan count in waterpoint type',df['waterpoint_type'].isnull().sum()) # Check if there are any null values in the column

print('---------------------------------------------------------')
print(df['waterpoint_type_group'].value_counts()) # Check if there are any null values in the column
print('Nan count in waterpoint type group',df['waterpoint_type_group'].isnull().sum()) # Check if there are any null values in the column
print('---------------------------------------------------------')
df=pd.get_dummies(df, columns=['waterpoint_type'],prefix='waterpoint_type',dtype=int) # Convert categorical variable into dummy/indicator variables
df.drop(columns=['waterpoint_type_other'],inplace=True) # Drop the first column to avoid dummy variable trap
df[['waterpoint_type_cattle trough',
       'waterpoint_type_communal standpipe',
       'waterpoint_type_communal standpipe multiple', 'waterpoint_type_dam',
       'waterpoint_type_hand pump', 'waterpoint_type_improved spring']].head()

In [None]:
28522+6103