In [1]:
import pandas as pd

# Load the dataset from a CSV file into a pandas DataFrame named `data`,
# which the cells below read from.
# NOTE(review): this is an absolute path inside one user's Windows profile, so
# the notebook cannot run on another machine without editing it — consider a
# path relative to the notebook or a configurable data directory.
file_path = 'C:\\Users\\Mr Maziya\\Documents\\Dataset1.csv'
data = pd.read_csv(file_path)

In [2]:
# Print the DataFrame's dimensions; `shape` is a (rows, columns) tuple.
print(f"Dataset shape: {data.shape}")

Dataset shape: (45, 15)


In [3]:
# Print the dtype pandas inferred for each column when reading the CSV.
# Columns reported as `object` were not parsed as numbers; they typically hold
# strings or mixed values and need cleaning/encoding before modelling.
print(data.dtypes)

Settlement Name           object
Location                  object
Province                  object
Latitude                 float64
Longitude                float64
Date of Establishment     object
Initial Population         int64
Current Population        object
Sanitation                object
Water Supply              object
Electricity               object
Health Issues             object
Development Projects      object
Project Status            object
Funding Source            object
dtype: object


In [4]:
# Print the column labels (a pandas Index) so the exact spellings are known
# before selecting columns by name.
print(data.columns)

Index(['Settlement Name', 'Location', 'Province', 'Latitude', 'Longitude',
       'Date of Establishment', 'Initial Population', 'Current Population',
       'Sanitation', 'Water Supply', 'Electricity', 'Health Issues',
       'Development Projects', 'Project Status', 'Funding Source'],
      dtype='object')


In [5]:
# Print summary statistics (count, mean, std, min, quartiles, max).
# Called with no arguments, describe() summarizes only the numeric columns,
# so object-typed columns do not appear in the resulting table.
print(data.describe())

        Latitude  Longitude  Initial Population
count  45.000000  45.000000                45.0
mean  -29.574253  25.947387               100.0
std     3.539681   4.889897                 0.0
min   -34.131300  18.391100               100.0
25%   -33.926800  18.700000               100.0
50%   -29.836800  27.960700               100.0
75%   -26.106700  29.064300               100.0
max   -25.732200  31.045300               100.0


In [6]:
# Preview the top-left corner of the table using positional indexing:
# iloc[:5, :5] selects rows 0-4 and columns 0-4 by integer position,
# independent of the row labels or column names.
print(data.iloc[:5, :5])  # first 5 rows and first 5 columns

                 Settlement Name      Location      Province  Latitude  \
0                    Khayelitsha     Cape Town  Western Cape  -34.0367   
1                      Diepsloot  Johannesburg       Gauteng  -25.9399   
2                      Alexandra  Johannesburg       Gauteng  -26.1067   
3                 Joe Slovo Park     Cape Town  Western Cape  -33.9168   
4  Barberton Informal Settlement     Barberton    Mpumalanga  -25.7868   

   Longitude  
0    18.6766  
1    27.9607  
2    28.1049  
3    18.5216  
4    31.0453  


In [7]:
# Tally the null entries in every column and print the full tally
missing_data = data.isna().sum()
print(missing_data)

# Narrow the tally down to the columns that contain at least one null
missing_columns = missing_data.loc[missing_data.gt(0)]
print(missing_columns)

Settlement Name           0
Location                  0
Province                  0
Latitude                  0
Longitude                 0
Date of Establishment     0
Initial Population        0
Current Population        0
Sanitation               42
Water Supply             37
Electricity              40
Health Issues             0
Development Projects      1
Project Status            1
Funding Source            1
dtype: int64
Sanitation              42
Water Supply            37
Electricity             40
Development Projects     1
Project Status           1
Funding Source           1
dtype: int64


In [8]:
from sklearn.preprocessing import StandardScaler

# Columns we want to rescale, limited to those the frame actually contains
# (a name missing from `data` is dropped silently rather than raising).
numeric_columns = [name for name in ('Latitude', 'Longitude') if name in data.columns]
print(f"Numeric columns to normalize: {numeric_columns}")

# Learn each column's mean and standard deviation, then rescale to zero mean
# and unit variance. The scaled values overwrite the originals in `data`.
scaler = StandardScaler().fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Show the first rows of the rescaled columns
print(data.loc[:, numeric_columns].head())

Numeric columns to normalize: ['Latitude', 'Longitude']
   Latitude  Longitude
0 -1.274937  -1.503702
1  1.038348   0.416382
2  0.990693   0.446204
3 -1.240681  -1.535758
4  1.082089   1.054321


### Data Types in Relation to Machine Learning

In the context of machine learning, the data types in our dataset can be categorized as follows:

- **Numerical Data**:
  - Columns like `Latitude` and `Longitude` are numerical and continuous. Columns of this kind can be summarized statistically and passed to machine learning models as features without an encoding step.
  - Example: `Latitude` has been standardized, so it now has a mean of 0 and a standard deviation of 1. Putting features on a common scale improves the performance and training stability of many machine learning algorithms.

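Normalization (rescaling to a fixed range such as [0, 1]) and standardization (rescaling to zero mean and unit variance, as `StandardScaler` does) put numerical features on a comparable scale. This matters most for scale-sensitive algorithms — for example k-nearest neighbours, support vector machines, and models trained by gradient descent — where a feature with a larger range would otherwise dominate the others; tree-based models are largely unaffected by feature scale.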
