In [None]:
# import modules
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as mp
from matplotlib import pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns",1000)
pd.set_option('display.max_rows', 17544) 
# Configuring plotting visual and sizes
%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('talk')
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (30, 10),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large'}


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate

In [None]:
# Load the raw log from GitHub, parse the 'Timestamp' column into datetimes
# and order the rows chronologically.
# (A bare `df.head()` used to sit in the middle of this cell; only the last
# expression of a cell is displayed, so it was a no-op and has been removed.)
DATA_URL = 'https://raw.githubusercontent.com/JattievdLinde/insights/main/data/Data.log'

df = (
    pd.read_csv(DATA_URL)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .sort_values(['Timestamp'])
)

In [None]:
# Display pandas' summary statistics for the loaded frame.
df.describe()

In [None]:
# Preview the first ten rows (earliest timestamps after the sort).
df.head(n=10)

In [None]:
# Show the column names as a one-column table for easier reading.
pd.DataFrame({'List of Columns': df.columns.tolist()})

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Columns removed from the working frame; none of them is referenced by the
# later cells of this notebook.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError.
columns_to_drop = [
    'PressureControl',
    'StackTC',
    'MagnetTC',
    'HeliumPressure',
    'RGAChannel1',
    'RGAChannel2',
    'RGAChannel3',
    'RGAChannel4',
    'RGAChannel5',
    'RGAChannel6',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Percentage of null entries per column.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

# Two-column table (column name, percent missing) with a fresh 0..n-1 index.
missing_value_df = (
    percent_missing
    .rename_axis('column_name')
    .reset_index(name='percent_missing')
)
missing_value_df

In [None]:
# Trim surrounding whitespace from the Element labels and show the counts
# before and after blank labels are renamed.
element_col = 'Element'
df[element_col] = df[element_col].str.strip()
print(df[element_col].value_counts())

# Empty strings are replaced by "Vacuum". The replacement is applied to the
# whole frame, not only to the Element column.
df = df.replace(to_replace="", value="Vacuum")
print(df[element_col].value_counts())

In [None]:
# Preview the last ten rows (latest timestamps after the sort).
df.tail(n=10)

In [None]:
# Pairwise correlation of the numeric columns, rendered as a heat-mapped table.
# Non-numeric columns (the string 'Element' labels, the datetime 'Timestamp')
# are excluded explicitly: pandas < 2.0 dropped them silently, but pandas >= 2.0
# raises when DataFrame.corr() meets a column it cannot convert to float.
# select_dtypes works on every pandas version, unlike the newer
# `numeric_only=` keyword.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Interactive time series of ProductTemp against Timestamp.
fig = px.line(data_frame=df, x='Timestamp', y='ProductTemp')
fig.show()

In [None]:
# Interactive time series of PSUTop against Timestamp.
fig = px.line(data_frame=df, x='Timestamp', y='PSUTop')
fig.show()

In [None]:
# Interactive time series of Field against Timestamp.
fig = px.line(data_frame=df, x='Timestamp', y='Field')
fig.show()

In [None]:

# Use the seaborn 'whitegrid' theme for this figure.
sns.set_style('whitegrid')

# Violin plot of ProductTemp per Element value on a 12x8 inch canvas.
violin_fig, violin_ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=df, x='Element', y='ProductTemp', ax=violin_ax)


In [None]:
# Use the seaborn 'whitegrid' theme for this figure.
sns.set_style('whitegrid')

# Violin plot of Pressure per Element value on a 12x8 inch canvas.
violin_fig, violin_ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=df, x='Element', y='Pressure', ax=violin_ax)

In [None]:
# Histogram of PSUMiddle with a kernel-density curve overlaid, drawn on a
# 9x6 inch canvas.
hist_fig, hist_ax = plt.subplots(figsize=(9, 6))
sns.histplot(data=df, x='PSUMiddle', kde=True, ax=hist_ax)

In [None]:
# Mean of each listed measurement per Element value.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names (`grouped['a', 'b']`) was deprecated in pandas 1.0 and removed in
# pandas 2.0.
mean_columns = ['ProductTemp', 'Pressure', 'TempChannel1', 'Field',
                'Top', 'PSUTop', 'PSUMiddle', 'PSUBottom']
avg_temp_element = df.groupby(['Element'])[mean_columns].mean().reset_index()
avg_temp_element

In [None]:
# Encode the Element label as an integer target:
#   "Vacuum" -> 1; every other value, including "He" -> 0.
# NOTE: the column is overwritten, so re-running this cell on the already
# encoded column yields all zeros.
is_vacuum = df['Element'] == "Vacuum"
df['Element'] = np.where(is_vacuum, 1, 0)

In [None]:
# Show the column labels of the working frame.
df.columns

In [None]:
# Feature matrix: one row per log entry, one column per selected measurement.
feature_columns = ['ProductTemp', 'Pressure', 'TempChannel1', 'Field',
                   'Top', 'Middle', 'Bottom',
                   'PSUTop', 'PSUMiddle', 'PSUBottom', 'Gas1']
X_class = np.array(df[feature_columns])
# Target vector. Selecting the single column as a Series keeps Y_class 1-D
# (n_samples,), the shape scikit-learn estimators expect; df[['Element']]
# would produce an (n_samples, 1) column vector instead.
Y_class = np.array(df['Element'])

# Dataset is randomly split 70:30 for the training and testing phases;
# random_state is fixed so the split is reproducible.
x_train_class, x_test_class, y_train_class, y_test_class = train_test_split(
    X_class, Y_class, test_size=0.3, random_state=0)
print("X_train has these samples:", x_train_class.shape)
print("X_test has these samples:", x_test_class.shape)
print("Y_train has these samples:", len(y_train_class))
# This line reports the test-set size; it was previously mislabelled "Y_train".
print("Y_test has these samples:", len(y_test_class))

In [None]:
# Candidate classifiers, each declared next to its display name so the label
# printed/plotted for a model cannot drift from the estimator it describes.
# (Previously the name list had "LDA" before "Logistic Regression" while the
# estimator list had them the other way round, so those two were mislabelled.)
model_specs = [
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
    # was deprecated and later removed from scikit-learn.
    ("Random Forest", RandomForestClassifier(n_estimators=200, criterion='gini', min_samples_split=5,
                                             min_samples_leaf=2, max_features='sqrt', bootstrap=True,
                                             n_jobs=-1, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("K-Nearest Neighbours", KNeighborsClassifier(weights='distance')),
    ("Logistic Regression", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis(n_components=1)),
    ("Naive Bayes", GaussianNB()),
]
classification_models = [label for label, _estimator in model_specs]
classifiers = [estimator for _label, estimator in model_specs]

# Estimators and metrics expect 1-D targets; flattening is a no-op when the
# arrays are already 1-D and fixes the shape when they are (n, 1) columns.
y_train_flat = np.ravel(y_train_class)
y_test_flat = np.ravel(y_test_class)
y_all_flat = np.ravel(Y_class)

# Hold-out predictions per model, reused for the confusion matrices below.
test_predictions = {}

for name, model in zip(classification_models, classifiers):
    model.fit(x_train_class, y_train_flat)
    y_pred_class = model.predict(x_test_class)
    test_predictions[name] = y_pred_class
    print (name, "Model")
    print("% Accuracy for ",name," is ",metrics.accuracy_score(y_test_flat, y_pred_class) * 100)
    # Precision and F1 are restricted to the labels that were actually
    # predicted; recall is computed over all labels (kept as in the original).
    print("Precision Score ",name," is ",metrics.precision_score(y_test_flat, y_pred_class, average='weighted', labels=np.unique(y_pred_class)))
    print("Recall Score ",name," is ",metrics.recall_score(y_test_flat, y_pred_class, average='weighted'))
    print("F1-Score Score ",name," is ",metrics.f1_score(y_test_flat, y_pred_class, average='weighted', labels=np.unique(y_pred_class)))
    print("Evaluating Cross Validation")
    # 5-fold cross-validation on the full dataset; cross_val_score fits clones,
    # so `model` itself stays fitted on the training split only.
    scores = cross_val_score(model, X_class, y_all_flat, cv=5)
    print("Scores:", scores)
    print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    print ("mean score and 95% confidence interval of the score estimate are given by")
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("*------------END-----------*")

# Generating confusion matrix and visualising using heatmap
# Built from confusion_matrix + ConfusionMatrixDisplay rather than
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
# NOTE: the file-level `from sklearn.metrics import plot_confusion_matrix`
# still fails on scikit-learn >= 1.2 and needs to be dropped separately.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10,8))
for model, ax, name in zip(classifiers, axes.flatten(), classification_models):
    # Pin the label order to the estimator's classes so rows/columns always
    # line up with the tick labels, even if a class is absent from the test set.
    cm = confusion_matrix(y_test_flat, test_predictions[name], labels=model.classes_)
    display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    display.plot(ax=ax, values_format=None, cmap='Blues', include_values=True)
    ax.title.set_text(name)
    ax.set_xlabel('Predicted Class', fontsize=8)
    ax.set_ylabel('Actual Class', fontsize=8)

plt.tight_layout()
plt.show()