In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
file_path = 'datasets/master.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
data_info = data.info()
data_head = data.head()

# Check for duplicates, this adds a new column to the dataset
data["is_duplicate"] = data.duplicated()

# Note that when using f-strings, the internal quote character must be different, such as 'is_duplicate' above
print(f"#total= {len(data)}")
print(f"#duplicated= {len(data[data['is_duplicate']==True])}")

# Check for missing values
print("Missing values:", data.isnull().sum())

# Drop rows with missing values
data.dropna(inplace=True)

print(f'N rows={len(data)} M columns={len(data.columns)}')
print(data.dtypes)

# Dropping the 'HDI for year' Column
data.drop(columns=['HDI for year', 'suicides_no', ' gdp_for_year ($) ', 'country-year', 'is_duplicate'], inplace=True)

# One-Hot Encoding for Categorical Variables
categorical_cols = ['country', 'sex', 'age', 'generation']
data = pd.get_dummies(data, columns=categorical_cols)
    
# Normalization/Standardization
scaler = StandardScaler()
numerical_cols = ['year', 'population', 'suicides/100k pop', 'gdp_per_capita ($)']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             27820 non-null  object 
 1   year                27820 non-null  int64  
 2   sex                 27820 non-null  object 
 3   age                 27820 non-null  object 
 4   suicides_no         27820 non-null  int64  
 5   population          27820 non-null  int64  
 6   suicides/100k pop   27820 non-null  float64
 7   country-year        27820 non-null  object 
 8   HDI for year        8364 non-null   float64
 9    gdp_for_year ($)   27820 non-null  object 
 10  gdp_per_capita ($)  27820 non-null  int64  
 11  generation          27820 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB
#total= 27820
#duplicated= 0
Missing values: country                   0
year                      0
sex                       0
age           

In [2]:
# Question 1

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Split the dataset into features (X) and target variable (y)
X = data.drop(columns=['suicides/100k pop']) 
y = data['suicides/100k pop']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the multiple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Print the number of regression coefficients
num_coefficients = len(model.coef_)
print("Number of regression coefficients:", num_coefficients)


Number of regression coefficients: 107


In [3]:
# Question 2

from sklearn.metrics import mean_absolute_error

# Filter the original DataFrame based on the specified conditions
subset_data = data[(data['age_15-24 years'] == 1) & 
                   (data['sex_male'] == 1) &
                   (data['generation_Generation X'] == 1)]

# Define the input data for prediction
input_data = subset_data.drop(columns=['suicides/100k pop'])  # Remove the target variable

# Use the trained model to make predictions
prediction = model.predict(input_data)

# Calculate the Mean Absolute Error (MAE)
true_values = subset_data['suicides/100k pop']  # Actual values from the subset
mae = mean_absolute_error(true_values, prediction)
print("MAE:", mae)


MAE: 0.3723764400203052


In [4]:
# Question 3 

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
file_path = 'datasets/master.csv'
data1 = pd.read_csv(file_path)

# Check for duplicates, this adds a new column to the dataset
data1["is_duplicate"]= data1.duplicated()

# Dropping redundant columns
data1.drop(columns=['HDI for year', 'suicides_no', ' gdp_for_year ($) ', 'country-year', 'is_duplicate'], inplace=True)

# Handling Categorical Variables
# For 'country' and 'sex', continue using Label Encoding
label_encoder = LabelEncoder()
data1['country'] = label_encoder.fit_transform(data1['country'])
data1['sex'] = label_encoder.fit_transform(data1['sex'])

# For 'generation', use a mapping to preserve natural order
generation_order = {'G.I. Generation': 1, 'Silent': 2, 'Boomers': 3, 'Generation X': 4, 'Millennials': 5, 'Generation Z': 6}
data1['generation'] = data1['generation'].map(generation_order)

# For 'age', sort numerically (you might need to adjust this depending on how age is formatted in your dataset)
age_order = {'5-14 years': 1, '15-24 years': 2, '25-34 years': 3, '35-54 years': 4, '55-74 years': 5, '75+ years': 6}
data1['age'] = data1['age'].map(age_order)

# Checking for missing values in each column
missing_values = data1.isnull().sum()

# Display the missing values
print(missing_values)

# Find the most frequent value (mode) in the 'generation' column
mode_generation = data1['generation'].mode()[0]

# Fill missing values in 'generation' with the mode
data1['generation'].fillna(mode_generation, inplace=True)

# Verify if there are any missing values left in 'generation'
missing_values = data1['generation'].isnull().sum()
print("Missing values in 'generation' after imputation:", missing_values)

# Normalization/Standardization of Numerical Variables
scaler = StandardScaler()
numerical_cols = ['year', 'population', 'gdp_per_capita ($)']
data1[numerical_cols] = scaler.fit_transform(data1[numerical_cols])

# Check the processed data
print(data1.head())

country                  0
year                     0
sex                      0
age                      0
population               0
suicides/100k pop        0
gdp_per_capita ($)       0
generation            5844
dtype: int64
Missing values in 'generation' after imputation: 0
   country      year  sex  age  population  suicides/100k pop  \
0        0 -1.683615    1    2   -0.391617               6.71   
1        0 -1.683615    1    4   -0.392870               5.19   
2        0 -1.683615    0    2   -0.397548               4.83   
3        0 -1.683615    1    6   -0.466035               4.59   
4        0 -1.683615    1    3   -0.401485               3.28   

   gdp_per_capita ($)  generation  
0           -0.850864         4.0  
1           -0.850864         2.0  
2           -0.850864         4.0  
3           -0.850864         1.0  
4           -0.850864         3.0  


In [5]:
# Question 3

# Build a new model using the original numerical form of 'sex', 'age', and 'generation'
X = data1[['year', 'sex', 'age', 'generation', 'population', 'gdp_per_capita ($)']]
y = data1['suicides/100k pop']

model = LinearRegression()
model.fit(X, y)

# Number of line coefficients
num_coefficients = len(model.coef_)
print("Number of line coefficients:", num_coefficients)

Number of line coefficients: 6


In [6]:
# Question 4

# Prepare the input data for prediction
input_data = pd.DataFrame({
    'year': [0],  
    'sex': [1],   # Male sex
    'age': [2],   # Age 20
    'generation': [4],  # Generation X
    'population': 0,     
    'gdp_per_capita ($)': 0  
})

# Use the model to make predictions
prediction = model.predict(input_data)

# Print the prediction
print("Predicted target value:", prediction)

# Calculate the Mean Absolute Error (MAE)
true_value = data1.loc[(data1['age'] == 2) & (data1['sex'] == 1) & (data1['generation'] == 4), 'suicides/100k pop'].values[0]
mae = mean_absolute_error([true_value], prediction)
print("MAE:", mae)


Predicted target value: [14.0331118]
MAE: 7.323111796377767


**Question 5**

**Model from Question 1**
- **Approach**: Used a preprocessed dataset where 'sex', 'age', and 'generation' were one-hot encoded.
- **Regression Coefficients**: The model had 107 coefficients, indicating a high dimensionality due to one-hot encoding.
- **Prediction and MAE for Specific Case (Age 20, Male, Generation X)**: Calculated a MAE of 0.3723764400203052. This value is specific to the subset of data representing 20-year-old males from Generation X.

**Model from Question 3**
- **Approach**: Used the original form of 'sex', 'age', and 'generation' variables, which were then feature-engineered into numerical features.
- **Regression Coefficients**: The model had 6 coefficients, indicating lower dimensionality compared to the one-hot encoded model.
- **Prediction and MAE for Specific Case (Age 20, Male, Generation X)**: The prediction was made for the same demographic group as in question 1, and the MAE was 7.323111796377767.

**Comparative Analysis**
1. **Complexity**: The first model is more complex due to the higher number of coefficients resulting from one-hot encoding. The second model is simpler with fewer coefficients, indicating lower complexity.

2. **Interpretability**: The second model might be easier to interpret due to the fewer number of coefficients and direct numerical transformation of 'age' and 'generation'.

3. **Performance**: Based on the MAE values, the first model (one-hot encoded) shows a lower error rate for the specific case of predicting the suicide rate for 20-year-old males from Generation X. However, it's important to note that MAE values are highly context-specific.

4. **Generalization**: Lower complexity models (like the second model) often generalize better to new data.

In [7]:
# Question 6

# Prepare the input data for prediction
input_data = pd.DataFrame({
    'year': [0],  
    'sex': [1],   # Male sex
    'age': [3],   # Age 33
    'generation': [7],  # Generation Alpha
    'population': [0],     
    'gdp_per_capita ($)': [0]  
})

# Use the model to make predictions
prediction = model.predict(input_data)

# Print the prediction
print("Predicted target value:", prediction)


Predicted target value: [21.82531268]


**Question 7**

One significant advantage of using regression (as opposed to classification with nominal features) in the context of independent variables is its ability to capture and utilize continuous and ordinal relationships among these variables. This characteristic is particularly beneficial when dealing with real-world datasets where variables often exist on a continuum or have a natural order, and their precise values carry meaningful and quantifiable information. Here's a more detailed breakdown of this advantage:

**Capturing Continuous Relationships:**
- **Precision and Nuance**: Regression models can interpret and use the exact values of continuous independent variables, allowing for a more nuanced and precise understanding of their impact on the dependent variable. This is particularly valuable in cases where slight changes in an independent variable could lead to significant changes in the outcome.
  
- **Interpretability of Coefficients**: In regression, the coefficients of continuous independent variables directly represent the change in the dependent variable for a unit change in the predictor. This interpretability is crucial in many fields like economics, medicine, and social sciences, where understanding the strength and direction of relationships is essential.

**Utilizing Ordinal Relationships:**
- **Ordinal Data Handling**: When dealing with ordinal data (data with a natural order, like 'low', 'medium', 'high'), regression can capture the intrinsic ordering in these variables, which is often lost in classification tasks. This can lead to more accurate and meaningful models, especially in cases where the order itself has an impact on the outcome.

**Summary:**
In summary, regression offers a robust framework for incorporating the exact or ordered values of independent variables into the model, leading to potentially more accurate predictions and richer insights, especially in scenarios where the subtleties and nuances in variable values play a crucial role in determining the outcome.

**Question 8**

One significant advantage of using regular numerical values rather than one-hot encoding for regression is related to model simplicity and dimensionality. When regular numerical values (especially for ordinal categorical variables) are used instead of one-hot encoding, it often leads to simpler models with fewer input features, which can offer several benefits:

**Reduced Model Complexity and Lower Dimensionality:**
- **Avoiding the Curse of Dimensionality**: One-hot encoding transforms categorical variables into multiple binary variables, one for each category. This can significantly increase the number of features in the dataset, leading to high-dimensional models. Using numerical values keeps the feature space smaller, helping to avoid the "curse of dimensionality" which can hamper model performance, particularly in terms of overfitting and computational efficiency.
  
- **Easier Interpretation and Analysis**: Models with fewer features are generally easier to interpret and analyze. Each coefficient in a regression model directly corresponds to one feature, so fewer features mean a more straightforward interpretation of how each variable affects the outcome.
  
- **Improved Computational Efficiency**: With fewer features, models often require less computational resources for training and prediction, making them more efficient to run. This can be especially beneficial when dealing with large datasets or when computational resources are limited.

- **Handling Ordinal Data Appropriately**: If the categorical variable is ordinal (where the order of categories is meaningful), encoding it as a regular numerical value preserves this order, which can lead to a more accurate model. One-hot encoding, on the other hand, treats each category as separate and equal, losing any ordinal information.

**Summary:**
In essence, using regular numerical values can simplify the model, making it more interpretable, computationally efficient, and potentially more accurate (especially in the case of ordinal data) compared to the complex, high-dimensional models that can result from one-hot encoding. 

**Question 9**

I would suggest using a regression model to the machine learning model customer. Here’s a more detailed justification:

1. **Nature of Target Variable**: The target variable in the problem is a continuous quantity, representing a rate. Regression is well-suited for predicting continuous outcomes and can provide precise, quantifiable predictions, which is essential for tasks like estimating suicide rates.

2. **Interpretability**: Regression models offer valuable interpretability, especially in terms of understanding how changes in independent variables like age, sex, and generation influence the suicide rate. Each coefficient in a regression model represents the expected change in the suicide rate for a unit change in a predictor, assuming other variables are held constant. This can provide insightful and actionable information.

3. **Subtlety and Nuance in Predictions**: Regression can capture the subtleties and nuances in the data, offering a detailed prediction. This is important in cases like yours where small changes in predictors could significantly influence the outcome.

4. **Model Performance and Suitability**: Based on the tasks in the assignment, the regression model has been tailored to predict a specific rate, aligning closely with the continuous nature of your target variable. This direct alignment often results in better performance for the given task.

In conclusion, while classification could categorize data into discrete classes (like high or low suicide risk), regression provides a more fitting approach for your task of predicting an exact numerical value. Its ability to provide detailed, continuous outputs and the interpretability of its results makes it the recommended choice for your machine learning customer in this scenario.