In [133]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

# Fetch dataset
infrared_thermography_temperature = fetch_ucirepo(id=925)

# Convert data to pandas DataFrame
X = pd.DataFrame(infrared_thermography_temperature.data.features)
y = pd.DataFrame(infrared_thermography_temperature.data.targets)
                


# Display all column names in the DataFrame
print(X.columns)
print(y.columns)


# Display the first few rows of the DataFrame
print(X.head())
print(y.head())


Index(['Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Distance',
       'T_offset1', 'Max1R13_1', 'Max1L13_1', 'aveAllR13_1', 'aveAllL13_1',
       'T_RC1', 'T_RC_Dry1', 'T_RC_Wet1', 'T_RC_Max1', 'T_LC1', 'T_LC_Dry1',
       'T_LC_Wet1', 'T_LC_Max1', 'RCC1', 'LCC1', 'canthiMax1', 'canthi4Max1',
       'T_FHCC1', 'T_FHRC1', 'T_FHLC1', 'T_FHBC1', 'T_FHTC1', 'T_FH_Max1',
       'T_FHC_Max1', 'T_Max1', 'T_OR1', 'T_OR_Max1'],
      dtype='object')
Index(['aveOralF', 'aveOralM'], dtype='object')
   Gender    Age                  Ethnicity  T_atm  Humidity  Distance  \
0    Male  41-50                      White   24.0      28.0       0.8   
1  Female  31-40  Black or African-American   24.0      26.0       0.8   
2  Female  21-30                      White   24.0      26.0       0.8   
3  Female  21-30  Black or African-American   24.0      27.0       0.8   
4    Male  18-20                      White   24.0      27.0       0.8   

   T_offset1  Max1R13_1  Max1L13_1  aveAllR13_1  ...  T

In [134]:
# Check for missing values in the features DataFrame (X)
missing_values_features = X.isnull()

# Count missing values in each feature column
missing_counts_features = X.isnull().sum()

# Display the counts of missing values in each feature column
print("\nCounts of missing values in each feature column:")
print(missing_counts_features)

# Check for missing values in the target DataFrame (y)
missing_values_target = y.isnull()

# Count missing values in the target column
missing_counts_target = y.isnull().sum()

# Display the counts of missing values in the target column
print("\nCounts of missing values in the target column:")
print(missing_counts_target)



Counts of missing values in each feature column:
Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       2
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
dtype: int64

Counts of missing values in the target column:
aveOralF    0
aveOralM    0
dtype: int64


In [135]:
# Drop rows with missing values in the features DataFrame (X)
X_cleaned = X.dropna()

# Drop rows with missing values in the target DataFrame (y)
y_cleaned = y.dropna()

# Ensure that both DataFrames have the same indices
# Align indices before dropping rows
common_indices = X_cleaned.index.intersection(y_cleaned.index)
X_cleaned = X_cleaned.loc[common_indices]
y_cleaned = y_cleaned.loc[common_indices]


In [136]:
# Check for missing values in the features DataFrame (X_cleaned)
missing_values_features_cleaned = X_cleaned.isnull()

# Count missing values in each feature column after cleaning
missing_counts_features_cleaned = X_cleaned.isnull().sum()

# Display the counts of missing values in each feature column after cleaning
print("\nCounts of missing values in each feature column after cleaning:")
print(missing_counts_features_cleaned)

# Check for missing values in the target DataFrame (y_cleaned)
missing_values_target_cleaned = y_cleaned.isnull()

# Count missing values in the target column after cleaning
missing_counts_target_cleaned = y_cleaned.isnull().sum()

# Display the counts of missing values in the target column after cleaning
print("\nCounts of missing values in the target column after cleaning:")
print(missing_counts_target_cleaned)



Counts of missing values in each feature column after cleaning:
Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       0
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
dtype: int64

Counts of missing values in the target column after cleaning:
aveOralF    0
aveOralM    0
dtype: int64


In [137]:
age_groups = X["Age"].unique()
print("age_groups : ", age_groups)

age_groups :  ['41-50' '31-40' '21-30' '18-20' '26-30' '21-25' '>60' '51-60']


In [138]:
# Apply one-hot encoding to the 'Age' column
X_encoded = pd.get_dummies(X, columns=['Age'], drop_first=True)

# Display the resulting DataFrame
print(X_encoded)

      Gender                  Ethnicity  T_atm  Humidity  Distance  T_offset1  \
0       Male                      White   24.0      28.0       0.8     0.7025   
1     Female  Black or African-American   24.0      26.0       0.8     0.7800   
2     Female                      White   24.0      26.0       0.8     0.8625   
3     Female  Black or African-American   24.0      27.0       0.8     0.9300   
4       Male                      White   24.0      27.0       0.8     0.8950   
...      ...                        ...    ...       ...       ...        ...   
1015  Female                      Asian   25.7      50.8       0.6     1.2225   
1016  Female                      White   25.7      50.8       0.6     1.4675   
1017  Female  Black or African-American   28.0      24.3       0.6     0.1300   
1018    Male            Hispanic/Latino   25.0      39.8       0.6     1.2450   
1019  Female                      White   23.8      45.6       0.6     0.8675   

      Max1R13_1  Max1L13_1 

In [139]:


# Function to convert age range
def convert_age_range(age_range):
    """Converts the age range to a single average value"""
    if isinstance(age_range, str):
        if '>' in age_range:
            return int(age_range.replace('>', '').strip())
        lower, upper = map(int, age_range.split('-'))
        return np.mean([lower, upper])
    return age_range  # Return the value as is if it's not a string

# Apply the function to the 'Age' column
X['Age'] = X['Age'].apply(convert_age_range)

# Check unique values in the 'Gender' column
print(X['Gender'].unique())

# Convert 'Gender' column to numeric
X['Gender'] = X['Gender'].map({'Male': 1, 'Female': 0})


# Display the updated DataFrame
print(X)




['Male' 'Female']
      Gender   Age                  Ethnicity  T_atm  Humidity  Distance  \
0          1  45.5                      White   24.0      28.0       0.8   
1          0  35.5  Black or African-American   24.0      26.0       0.8   
2          0  25.5                      White   24.0      26.0       0.8   
3          0  25.5  Black or African-American   24.0      27.0       0.8   
4          1  19.0                      White   24.0      27.0       0.8   
...      ...   ...                        ...    ...       ...       ...   
1015       0  23.0                      Asian   25.7      50.8       0.6   
1016       0  23.0                      White   25.7      50.8       0.6   
1017       0  19.0  Black or African-American   28.0      24.3       0.6   
1018       1  28.0            Hispanic/Latino   25.0      39.8       0.6   
1019       0  19.0                      White   23.8      45.6       0.6   

      T_offset1  Max1R13_1  Max1L13_1  aveAllR13_1  ...  T_FHCC1  T_F

In [140]:
# Select the relevant columns
selected_columns_X = ['Age', 'Gender', 'T_atm', 'Humidity', 'T_offset1']
selected_columns_y = ['aveOralM']

# Get the statistical summary
statistical_details_X = X[selected_columns_X].describe()
statistical_details_y = y[selected_columns_y].describe()


# Display the statistical details
print(statistical_details_X)
print(statistical_details_y)


               Age       Gender        T_atm     Humidity    T_offset1
count  1020.000000  1020.000000  1020.000000  1020.000000  1020.000000
mean     22.296569     0.405882    24.115392    28.723039     0.968648
std       5.852500     0.491303     1.336338    13.071627     0.362587
min      19.000000     0.000000    20.200000     9.900000    -0.590000
25%      19.000000     0.000000    23.400000    17.600000     0.772500
50%      19.000000     0.000000    24.000000    26.300000     0.940000
75%      23.000000     1.000000    24.700000    36.200000     1.140000
max      60.000000     1.000000    29.100000    61.200000     2.875000
          aveOralM
count  1020.000000
mean     37.028382
std       0.509502
min      35.540000
25%      36.777500
50%      36.940000
75%      37.140000
max      40.340000


In [141]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [142]:

# Assuming X_new and y_new are already defined with the selected columns
X_new = X[['Age', 'Gender', 'T_atm', 'Humidity', 'T_offset1']]
y_new = y['aveOralM']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)


# Estimate the coefficients corresponding to the independent variables
coefficients = model.coef_

# Create a DataFrame to display the coefficients alongside the feature names
coefficients_df = pd.DataFrame({
    'Feature': X_new.columns,
    'Coefficient': coefficients
})

# Display the estimated intercept & coefficients 
print('Intercept: \n', model.intercept_)
print(coefficients_df)


Intercept: 
 36.62796301934947
     Feature  Coefficient
0        Age    -0.008195
1     Gender     0.066194
2      T_atm     0.015053
3   Humidity     0.001640
4  T_offset1     0.148206


In [143]:

# Select the independent features and the dependent feature
independent_features = ['T_OR1', 'T_OR_Max1', 'T_FHC_Max1', 'T_FH_Max1']
X1 = X[independent_features]
y1 = y['aveOralM']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Estimate the coefficients corresponding to the independent variables
coefficients = model.coef_

# Create a DataFrame to display the coefficients alongside the feature names
coefficients_df = pd.DataFrame({
    'Feature': independent_features,
    'Coefficient': coefficients
})

# Display the estimated coefficients
print('Intercept: \n', model.intercept_)
print(coefficients_df)

Intercept: 
 7.036879763545965
      Feature  Coefficient
0       T_OR1     0.091997
1   T_OR_Max1     0.464070
2  T_FHC_Max1    -0.087332
3   T_FH_Max1     0.370886


In [144]:

# Make predictions
y_pred = model.predict(X_test)

# Calculate Residual Sum of Squares (RSS)
residual_sum_of_squares = np.sum((y_pred - y_test) ** 2)

# Calculate Residual Standard Error (RSE)
n = X_test.shape[0]
d = X_test.shape[1]
residual_standard_error = np.sqrt(residual_sum_of_squares / (n - d - 1))

# Calculate Mean Squared Error (MSE)
mean_squared_error_value = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R²)
r_squared = model.score(X_test, y_test)

# Prepare data for OLS model to get additional statistics
X_test_with_intercept = np.c_[np.ones(X_test.shape[0]), X_test]  # Add constant term
ols_model = sm.OLS(y_test, X_test_with_intercept).fit()

# Extract statistics from the OLS model
standard_errors = ols_model.bse[1:]  # Exclude the intercept term
t_values = ols_model.tvalues[1:]  # Exclude the intercept term
p_values = ols_model.pvalues[1:]  # Exclude the intercept term

# Display results
print(f"Residual Sum of Squares (RSS): {residual_sum_of_squares}")
print(f"Residual Standard Error (RSE): {residual_standard_error}")
print(f"Mean Squared Error (MSE): {mean_squared_error_value}")
print(f"R-squared (R²): {r_squared}")

# Create a DataFrame to display feature statistics
results_df = pd.DataFrame({
    'Feature': independent_features,
    'Coefficient': model.coef_,
    'Standard Error': standard_errors,
    't-Statistic': t_values,
    'p-Value': p_values
})

print("\nFeature Statistics:\n", results_df)


Residual Sum of Squares (RSS): 15.170504359408241
Residual Standard Error (RSE): 0.2761044915394942
Mean Squared Error (MSE): 0.07436521744807961
R-squared (R²): 0.646842080055587

Feature Statistics:
        Feature  Coefficient  Standard Error  t-Statistic   p-Value
x1       T_OR1     0.091997        1.624156     0.828232  0.408531
x2   T_OR_Max1     0.464070        1.619382    -0.507885  0.612096
x3  T_FHC_Max1    -0.087332        0.081987     0.534191  0.593806
x4   T_FH_Max1     0.370886        0.090514     2.284496  0.023397
