In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox, yeojohnson
import statsmodels.regression.linear_model as srl
import statsmodels.formula.api as smf

In [54]:
def load_data():
        '''
        This function loads the data from a csv file.
        
        Returns:
            df (DataFrame): The loaded data.
        '''
        df = pd.read_csv('failure_data.csv')
        
        df['Type'] = df['Type'].astype('category')
        without_nulls = df.dropna(subset=('Air temperature [K]', 'Process temperature [K]'))
        process_temp_gradient, process_temp_intercept = np.polyfit(without_nulls['Air temperature [K]'], without_nulls['Process temperature [K]'], deg=1)
        air_temp_gradient, air_temp_intercept = np.polyfit(without_nulls['Process temperature [K]'], without_nulls['Air temperature [K]'], deg=1)
        process_temp_nulls = df[df['Air temperature [K]'].notnull() & df['Process temperature [K]'].isnull()]
        process_temp_nulls['Process temperature [K]'] = np.round(process_temp_gradient * process_temp_nulls['Air temperature [K]'] + process_temp_intercept, 1)
        air_temp_nulls = df[df['Air temperature [K]'].isnull() & df['Process temperature [K]'].notnull()]
        air_temp_nulls['Air temperature [K]'] = np.round(air_temp_gradient * air_temp_nulls['Process temperature [K]'] + air_temp_intercept, 1)
        df.update(air_temp_nulls)
        df.update(process_temp_nulls)
        df = df.dropna(subset=('Air temperature [K]', 'Process temperature [K]'))
        df['Tool wear [min]'] = df['Tool wear [min]'].fillna(df['Tool wear [min]'].mean())

        df['Rotational speed [rpm]'] = boxcox(df['Rotational speed [rpm]'])[0]
        df['Machine failure'] = yeojohnson(df['Machine failure'] + 0.1)[0]
        df['TWF'] = yeojohnson(df['TWF'] + 0.1)[0]
        df['HDF'] = yeojohnson(df['HDF'] + 0.1)[0]
        df['PWF'] = yeojohnson(df['PWF'] + 0.1)[0]
        df['OSF'] = yeojohnson(df['OSF'] + 0.1)[0]
        df['RNF'] = yeojohnson(df['RNF'] + 0.1)[0]
        print(process_temp_gradient)
        print(air_temp_gradient)
        return df

df = load_data()


0.6504895742994061
1.1764186050600978


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  process_temp_nulls['Process temperature [K]'] = np.round(process_temp_gradient * process_temp_nulls['Air temperature [K]'] + process_temp_intercept, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_temp_nulls['Air temperature [K]'] = np.round(air_temp_gradient * air_temp_nulls['Process temperature [K]'] + air_temp_intercept, 1)


In [5]:
import plotly.express as px
px.imshow(df.corr(), title="Correlation heatmap of dataframe")

  px.imshow(df.corr(), title="Correlation heatmap of dataframe")


In [117]:
df.drop('Rotational_speed_rpm', axis=1, inplace=True)
df.head()

Unnamed: 0,UDI,Product_ID,Type,Air_temperature_K,Process_temperature_K,Torque_Nm,Tool_wear_min,Machine_failure,TWF,HDF,PWF,OSF,RNF
0,3416.0,L50595,L,301.4,310.4,36.9,133.0,0.021618,0.003188,0.00743,0.006127,0.006322,0.003067
1,7130.0,L54309,L,300.6,310.0,31.0,107.0,0.021618,0.003188,0.00743,0.006127,0.006322,0.003067
2,2320.0,M17179,M,299.2,308.8,33.5,185.0,0.021618,0.003188,0.00743,0.006127,0.006322,0.003067
3,9601.0,M24460,M,298.9,310.0,45.3,58.0,0.021618,0.003188,0.00743,0.006127,0.006322,0.003067
4,614.0,L47793,L,298.1,309.9,30.0,53.0,0.021618,0.003188,0.00743,0.006127,0.006322,0.003067


In [113]:
#df.columns = df.columns.str.replace(' ', '_')
#df.columns = df.columns.str.replace('[', '')
#df.columns = df.columns.str.replace(']', '')
model0 = smf.ols('Air_temperature_K ~ Process_temperature_K + Rotational_speed_rpm + Torque_Nm + Tool_wear_min', df).fit()
model1 = smf.ols('Process_temperature_K ~ Air_temperature_K + Rotational_speed_rpm + Torque_Nm + Tool_wear_min', df).fit()
model2 = smf.ols('Rotational_speed_rpm ~ Process_temperature_K + Air_temperature_K + Torque_Nm + Tool_wear_min', df).fit()
model3 = smf.ols('Torque_Nm ~ Process_temperature_K + Air_temperature_K + Tool_wear_min', df).fit()
model4 = smf.ols('Tool_wear_min ~ Process_temperature_K + Air_temperature_K + Rotational_speed_rpm + Torque_Nm', df).fit()
print(model0.rsquared, model1.rsquared, model2.rsquared, model3.rsquared, model4.rsquared)


0.7951800694745133 0.7951421511798512 0.8427644280834368 0.00020160180821748863 0.00038352779287309513
