Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report,roc_curve,roc_auc_score
from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

!pip install xgboost
from xgboost import XGBClassifier

plt.rcParams['figure.figsize']=(15,10)
plt.rcParams['figure.dpi']=300
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load The DataSet
df=pd.read_csv("/content/sensor.csv")
df

In [None]:
df.describe()

In [None]:
df.describe(include=object)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Drop the leftover CSV index column. Rebinding (instead of inplace=True) with
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")


In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
df

In [None]:
df.dtypes

In [None]:
df.duplicated().sum()

In [None]:
df.isnull().sum()

The column "sensor_15" contains only null values, so we drop it.


In [None]:
# sensor_15 carries no information (all values missing). Rebinding with
# errors="ignore" lets this cell be re-run without a KeyError.
df = df.drop(columns="sensor_15", errors="ignore")

In [None]:
sns.heatmap(df.isnull(),cmap="viridis")
plt.tight_layout()

Most of the columns in the dataset contain missing values.

Before replacing the missing values, check the columns for outliers. Since the columns are numerical:

- if outliers are present, replace the missing values with the median (the median is not distorted by extreme values);
- otherwise, replace the missing values with the mean.

In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

In [None]:
df.hist()
plt.tight_layout()

In [None]:
# Percentage of missing values per column, printed together with the column
# name so each figure can be matched to its sensor.
missing_pct = df.isna().mean() * 100
for col, pct in missing_pct.items():
    print(f"{col}: {pct:.2f}%")

In [None]:
# Replace missing values with the column median for all numerical columns.
#
# The result is assigned back to df[col] rather than calling
# df[col].fillna(..., inplace=True): the latter operates on a temporary Series
# (chained assignment), which triggers a FutureWarning in pandas 2.x and does
# not update df at all once Copy-on-Write is enabled.
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())


In [None]:
 df.isnull().sum()

The `machine_status` column is categorical (it holds 3 distinct status classes); convert it into numerical labels.

In [None]:
df["machine_status"].value_counts()

In [None]:
le=LabelEncoder()
df["machine_status"]=le.fit_transform(df["machine_status"])

In [None]:
df["machine_status"].value_counts()

In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

# Treating the outliers
Replacing outlier values with a boundary value is called capping.
In capping, every outlier value is replaced by the upper extreme or the lower extreme.
Outlier detection: a user-defined function calculates the upper extreme (Q3 + 1.5 × IQR) and the lower extreme (Q1 − 1.5 × IQR) of each numerical column.

In [None]:


def calculate_extremes(df):
    """Print the quartiles, IQR and outlier fences of every numeric column.

    For each column selected by ``select_dtypes(include=np.number)`` the
    report lists Q1, Q2 (median), Q3, the interquartile range and the two
    fences ``Q3 + 1.5 * IQR`` (upper extreme) and ``Q1 - 1.5 * IQR``
    (lower extreme).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        The report is written to stdout.
    """
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q2 = df[col].quantile(0.5)  # Quartile 2 (Median)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        upper_extreme = q3 + (1.5 * iqr)
        lower_extreme = q1 - (1.5 * iqr)

        print(f"Column: {col}")
        print(f"Quartile 1 (Q1): {q1}")
        print(f"Quartile 2 (Q2/Median): {q2}")
        # Q3 was computed but missing from the report, although both the IQR
        # and the upper extreme are derived from it.
        print(f"Quartile 3 (Q3): {q3}")
        print(f"Interquartile Range (IQR): {iqr}")
        print(f"Upper Extreme: {upper_extreme}")
        print(f"Lower Extreme: {lower_extreme}")
        print("-" * 20)

calculate_extremes(df)


In [None]:

def treat_outliers(df, exclude=("machine_status",), whisker=1.5):
    """Cap outliers of the numeric columns at their IQR fences.

    For every column selected by ``select_dtypes(include=np.number)`` that is
    not listed in ``exclude``, values above ``Q3 + whisker * IQR`` are replaced
    by that upper fence and values below ``Q1 - whisker * IQR`` by that lower
    fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the capping is done on a copy so
        that re-running the calling cell always starts from the same data.
    exclude : iterable of str, optional
        Column names to leave as they are. Defaults to the encoded target
        column ``machine_status``.
    whisker : float, optional
        Multiplier applied to the IQR when computing the fences (default 1.5).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in which the selected numeric
        columns have been capped.
    """
    capped = df.copy()
    for col in capped.select_dtypes(include=np.number).columns:
        if col in exclude:
            continue
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1
        lower_extreme = q1 - (whisker * iqr)
        upper_extreme = q3 + (whisker * iqr)
        # clip() applies both bounds in one pass and leaves NaN values as NaN.
        capped[col] = capped[col].clip(lower=lower_extreme, upper=upper_extreme)
    return capped

df = treat_outliers(df)


In [None]:
df

In [None]:
df.describe()

In [None]:
df["machine_status"].value_counts()

In [None]:
sns.countplot(x='machine_status', data=df,palette="viridis")
plt.title('Machine Status')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

In [None]:
df.hist(color="yellow",edgecolor="black")
plt.tight_layout()

In [None]:


# Plot each sensor against time, one figure per sensor.
for column in df.columns:
    # 'timestamp' is the x-axis and 'machine_status' is the target, so neither
    # of them is drawn as a sensor reading.
    if column in ('machine_status', 'timestamp'):
        continue
    fig = plt.figure(figsize=(10, 6))
    # Use the actual timestamps on the x-axis so it matches the 'Time' label
    # (the row index was plotted before).
    plt.plot(df['timestamp'], df[column], label=column)
    plt.xlabel('Time')
    plt.ylabel('Sensor Reading')
    plt.title(f'Sensor {column} Readings Over Time')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    # Release the figure; the loop creates one figure per sensor column.
    plt.close(fig)


In [None]:
# df still contains the datetime 'timestamp' column. How DataFrame.corr()
# treats non-numeric columns depends on the pandas version, so restrict the
# correlation to the numeric columns explicitly.
df.select_dtypes(include=np.number).corr()

In [None]:
# Correlate only the numeric columns: df still holds the datetime 'timestamp'
# column, whose handling by DataFrame.corr() depends on the pandas version.
sns.heatmap(df.select_dtypes(include=np.number).corr(),annot=True,cmap="RdYlGn")
plt.tight_layout()

In [None]:
x = df.drop(["machine_status", "timestamp"], axis=1)
y = df["machine_status"]

In [None]:
x

In [None]:
y

In [None]:

# Standardise the feature matrix x with StandardScaler; x1 holds the scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the test rows contribute to the scaling statistics —
# consider fitting on the training portion only.
sc=StandardScaler()
x1=sc.fit_transform(x)

In [None]:
x=pd.DataFrame(x1,columns=x.columns)
x

In [None]:
x.describe()

# **Model Building**

In [None]:
# Split the dataset into training (67%) and testing (33%) data.
# random_state=42 makes the shuffle reproducible.
# NOTE(review): the split is not stratified; if the machine_status classes are
# imbalanced, consider passing stratify=y — TODO confirm class counts.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=42)

In [None]:
x_train

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

## **1.Logistic Regression**

In [None]:
model1=LogisticRegression()
model1.fit(x_train,y_train)

In [None]:
y_predict1=model1.predict(x_test)
y_predict1

In [None]:
df1=pd.DataFrame({"Actual":y_test,"predicted":y_predict1})
df1

In [None]:
conf_matrix=confusion_matrix(y_test,y_predict1)
print(conf_matrix)

In [None]:
print("training accuracy:", model1.score(x_train,y_train))
print("testing accuracy:", model1.score(x_test,y_test))

In [None]:
class_report1=classification_report(y_test,y_predict1)
print(class_report1)

## **2. Decision Tree Classifier**

In [None]:
# Baseline tree with default hyper-parameters. random_state is fixed (same seed
# as the tuned tree and the train/test split) so that repeated runs build the
# same tree and report the same scores.
model2=DecisionTreeClassifier(random_state=42)
model2.fit(x_train,y_train)

In [None]:
x.columns

In [None]:
y.unique()

In [None]:
# Pass the feature names as a plain list; some scikit-learn releases reject a
# pandas Index for feature_names.
fn=list(x.columns)
# plot_tree expects class_names in ascending order of the class labels, i.e. in
# the order of model2.classes_. The previous hard-coded ['2','1','0'] was in
# reverse order and therefore labelled class 0 as '2' and class 2 as '0'.
cn=[str(label) for label in model2.classes_]
tree.plot_tree(model2,feature_names=fn,class_names=cn,filled=True)
plt.tight_layout()

In [None]:
y_predict2=model2.predict(x_test)
y_predict2

In [None]:

y_test.values

In [None]:
print("train_accuracy:",model2.score(x_train,y_train))
print("test_accuracy:",model2.score(x_test,y_test))

In [None]:
model2=DecisionTreeClassifier(criterion="gini",max_depth=3,min_samples_split=8,random_state=42)
model2.fit(x_train,y_train)

In [None]:
# Pass the feature names as a plain list; some scikit-learn releases reject a
# pandas Index for feature_names.
fn=list(x.columns)
# plot_tree expects class_names in ascending order of the class labels, i.e. in
# the order of model2.classes_. The previous hard-coded ['2','1','0'] was in
# reverse order and therefore labelled class 0 as '2' and class 2 as '0'.
cn=[str(label) for label in model2.classes_]
tree.plot_tree(model2,feature_names=fn,class_names=cn,filled=True)
plt.tight_layout()

In [None]:
y_predict2=model2.predict(x_test)
y_predict2

In [None]:
y_test.values

In [None]:
print("train_accuracy:",model2.score(x_train,y_train))
print("test_accuracy:",model2.score(x_test,y_test))

In [None]:
class_report2=classification_report(y_test,y_predict2)
print(class_report2)

## **3. Random forest Classifier**

In [None]:
model3=RandomForestClassifier(random_state=2)
model3.fit(x_train,y_train)

In [None]:
y_predict3=model3.predict(x_test)
y_predict3

In [None]:
y_test.values

In [None]:
print('train accuracy:',model3.score(x_train,y_train))
print('test accuracy:',model3.score(x_test,y_test))

## **4. Gradient Boosting Classifier**

In [None]:
# Gradient boosting with a fixed random_state so that repeated runs give the
# same model and the same reported scores (the random forest and the tuned
# decision tree are seeded as well).
model4=GradientBoostingClassifier(learning_rate=0.6,max_depth=3,n_estimators=110,random_state=42)
model4.fit(x_train,y_train)

In [None]:
y_predict4=model4.predict(x_test)
y_predict4

In [None]:
y_test.values

In [None]:
print('train accuracy:',model4.score(x_train,y_train))
print('test accuracy:',model4.score(x_test,y_test))

In [None]:
class_report4=classification_report(y_test,y_predict4)
print(class_report4)

In [None]:
df.columns