In [1]:
import pandas as pd
from tqdm.auto import tqdm

# Load the CSV file into a DataFrame
file_path = 'DataSet.csv'
data = pd.read_csv(file_path, encoding='UTF-8-SIG')

# Display the head of the DataFrame
data_head = data.head()

# Show the head of the DataFrame
print(data_head)

   AGE  GENDER  AORTIC VALVE  LEFT ATRIUM  EDD  ESD    EF  IVS (D)  PW (D)  \
0   52       1             1          3.5  4.5  3.0  60.0      1.2     1.2   
1   21       1             0          3.0  4.3  2.6  66.0      1.0     1.0   
2   55       1             1          3.4  4.5  2.9  65.0      1.2     1.2   
3   42       1             1          3.7  4.8  3.6  35.0      1.2     1.2   
4   38       0             0          3.4  4.3  2.7  64.0      1.0     1.0   

   AORTA  I.A.S  RVSP  RWMA  TARGET  
0    2.7      1    35     0       1  
1    2.7      1    30     0       0  
2    2.2      1    40     0       1  
3    2.8      1    40     1       1  
4    2.7      1    33     0       0  


In [2]:
from tqdm.auto import tqdm

# Check for missing values
missing_values = data.isnull().sum()

# Check for duplicate rows
duplicate_rows = data.duplicated().sum()

# Check for any obvious issues with data types
data_types = data.dtypes

# Summary statistics to identify any outliers or anomalies
summary_statistics = data.describe()

# Display the findings
print('Missing Values:\n', missing_values)
print('\nDuplicate Rows:', duplicate_rows)
print('\nData Types:\n', data_types)
print('\nSummary Statistics:\n', summary_statistics)

Missing Values:
 AGE             0
GENDER          0
AORTIC VALVE    0
LEFT ATRIUM     0
EDD             0
ESD             0
EF              0
IVS (D)         0
PW (D)          0
AORTA           0
I.A.S           0
RVSP            0
RWMA            0
TARGET          0
dtype: int64

Duplicate Rows: 960

Data Types:
 AGE               int64
GENDER            int64
AORTIC VALVE      int64
LEFT ATRIUM     float64
EDD             float64
ESD             float64
EF              float64
IVS (D)         float64
PW (D)          float64
AORTA           float64
I.A.S             int64
RVSP              int64
RWMA              int64
TARGET            int64
dtype: object

Summary Statistics:
                AGE       GENDER  AORTIC VALVE  LEFT ATRIUM          EDD  \
count  1280.000000  1280.000000   1280.000000  1280.000000  1280.000000   
mean     51.878125     0.625000      0.453125     3.471906     4.497813   
std      15.549990     0.484312      0.497992     0.578890     0.331360   
min       7

In [3]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Select columns to scale, typically numerical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Apply scaling to the numerical columns
data_scaled = data.copy()
data_scaled[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Display the head of the scaled DataFrame
data_scaled_head = data_scaled.head()

# Show the head of the scaled DataFrame
print(data_scaled_head)

        AGE    GENDER  AORTIC VALVE  LEFT ATRIUM       EDD       ESD  \
0  0.007841  0.774597      1.098588     0.048549  0.006604 -0.051133   
1 -1.986509  0.774597     -0.910259    -0.815510 -0.597204 -0.179972   
2  0.200842  0.774597      1.098588    -0.124262  0.006604 -0.083343   
3 -0.635498  0.774597      1.098588     0.394173  0.912317  0.142126   
4 -0.892833 -1.290994     -0.910259    -0.124262 -0.597204 -0.147763   

         EF   IVS (D)    PW (D)     AORTA     I.A.S      RVSP      RWMA  \
0 -0.039739  0.651296  0.644538 -0.118439  0.079305  0.025385 -0.553283   
1  0.652633 -1.312102 -1.316967 -0.118439  0.079305 -0.740954 -0.553283   
2  0.537238  0.651296  0.644538 -4.238047  0.079305  0.791724 -0.553283   
3 -2.924624  0.651296  0.644538  0.705483  0.079305  0.791724  1.807392   
4  0.421842 -1.312102 -1.316967 -0.118439  0.079305 -0.281151 -0.553283   

     TARGET  
0  0.596601  
1 -1.676163  
2  0.596601  
3  0.596601  
4 -1.676163  


In [4]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

# Apply PCA for feature extazraction
pca = PCA(n_components=0.95) # Keep 95% of variance
principal_components = pca.fit_transform(data_scaled[numerical_cols])

# Convert to DataFrame for easier handling
pca_df = pd.DataFrame(data=principal_components)

# Apply SelectKBest for feature selection
selector = SelectKBest(f_classif, k='all')
selector.fit(data_scaled[numerical_cols], data_scaled['TARGET'])

# Get the scores for each feature
feature_scores = selector.scores_

# Convert to DataFrame for easier handling
feature_scores_df = pd.DataFrame({'Feature': numerical_cols, 'Score': feature_scores})

# Sort the DataFrame by the scores in descending order
feature_scores_df = feature_scores_df.sort_values(by='Score', ascending=False)

# Display the PCA DataFrame head and the feature scores
print('PCA DataFrame (head):')
print(pca_df.head())
print('\nFeature Scores:')
print(feature_scores_df)

PCA DataFrame (head):
         0         1         2         3         4         5         6   \
0 -0.933810 -0.766635  0.361479 -0.274011 -0.253074  0.137433  0.507175   
1  3.354016  0.932505 -0.128299 -0.785179 -0.366485  0.330822  0.316016   
2 -0.408380 -1.027156  2.215161  1.529083 -0.095618  1.492463  1.767276   
3 -2.870323  1.899700  0.589156 -0.102978 -0.991356  0.214460  0.052823   
4  2.746999  0.726967 -0.646917  0.558223  0.599754 -0.841184 -0.266591   

         7         8         9         10  
0 -0.974882 -0.015375 -0.039891 -0.554265  
1 -0.107148 -0.067071  0.550617 -0.532475  
2 -1.530093 -2.176816 -0.371606  0.678073  
3  0.478801  0.256710  0.574496 -0.757207  
4  0.478233 -0.384944  0.301600  0.181355  

Feature Scores:
         Feature         Score
13        TARGET  1.027786e+18
8         PW (D)  5.924585e+02
7        IVS (D)  5.829282e+02
2   AORTIC VALVE  5.345481e+02
0            AGE  1.777585e+02
12          RWMA  1.562772e+02
11          RVSP  9.537737e+0

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Load your dataset
data_scaled = pd.read_csv('DataSet.csv')

# Define your features (X) and target variable (y)
X = data_scaled.drop('TARGET', axis=1)
y = (data_scaled['TARGET'] > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100)

# Fit the Random Forest classifier on the training data
rf_clf.fit(X_train, y_train)

# Instantiate the SVM classifier
svm_clf = SVC(kernel='linear')  # You can choose a different kernel based on your needs

# Fit the SVM classifier on the training data
svm_clf.fit(X_train, y_train)

# Now, you can use both trained models for predictions or evaluation on the test set


# Assuming 'data_scaled.csv' is your dataset file
data_scaled = pd.read_csv('DataSet.csv')

# Define your features (X) and target variable (y)
X = data_scaled.drop('TARGET', axis=1)
y = (data_scaled['TARGET'] > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100)

# Fit the Random Forest classifier on the training data
rf_clf.fit(X_train, y_train)

# Now, you can use the trained model for predictions or evaluation on the test set

# Convert the target variable back to categorical
y = (data_scaled['TARGET'] > 0).astype(int)

# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Retrain the Random Forest classifier
rf_clf.fit(X_train, y_train)
# Retrain the SVM classifier
svm_clf.fit(X_train, y_train)

# Make predictions with both classifiers
rf_predictions = rf_clf.predict(X_test)
svm_predictions = svm_clf.predict(X_test)

# Combine predictions - here we will simply average them
hybrid_predictions = (rf_predictions + svm_predictions) / 2

# Evaluate the hybrid model
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions.round())

# Print the accuracy of the hybrid model
print('Hybrid Model Accuracy:', hybrid_accuracy)

Hybrid Model Accuracy: 0.97265625


In [6]:


# Convert the target variable back to categorical
X = data_scaled.drop('TARGET', axis=1)
y = (data_scaled['TARGET'] > 0).astype(int)

# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Retrain the Random Forest classifier
rf_clf.fit(X_train, y_train)
# Retrain the SVM classifier
svm_clf.fit(X_train, y_train)

# Make predictions with both classifiers
rf_predictions = rf_clf.predict(X_test)
svm_predictions = svm_clf.predict(X_test)

# Combine predictions - here we will simply average them
hybrid_predictions = (rf_predictions + svm_predictions) / 2

# Evaluate the hybrid model
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions.round())

# Print the accuracy of the hybrid model
print('Hybrid Model Accuracy:', hybrid_accuracy)

Hybrid Model Accuracy: 0.97265625


In [7]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score

class HybridModel(BaseEstimator):
    def __init__(self):
        self.rf_clf = RandomForestClassifier(random_state=42)
        self.svm_clf = SVC(probability=True, random_state=42)

    def fit(self, X, y):
        self.rf_clf.fit(X, y)
        self.svm_clf.fit(X, y)
        return self

    def predict(self, X):
        rf_predictions = self.rf_clf.predict_proba(X)[:, 1]
        svm_predictions = self.svm_clf.decision_function(X)
        hybrid_predictions = (rf_predictions + svm_predictions) / 2
        return (hybrid_predictions > 0.5).astype(int)

# Initialize the hybrid model
hybrid_model = HybridModel()

# Perform cross-validation
scores = cross_val_score(hybrid_model, X, y, cv=5, scoring='accuracy')

# Calculate the mean and standard deviation of the cross-validation scores
mean_score = scores.mean()
std_dev_score = scores.std()

# Display the results
print('Cross-validated scores:', scores)
print('Mean accuracy:', mean_score)
print('Standard deviation:', std_dev_score)

Cross-validated scores: [0.984375   0.984375   0.98046875 0.9765625  0.98828125]
Mean accuracy: 0.9828125
Standard deviation: 0.003983608994994363


In [8]:
# Fit the hybrid model on the entire dataset

hybrid_model.fit(X, y)

# Predict on the test set
y_pred = hybrid_model.predict(X_test)

# Calculate performance metrics
from sklearn.metrics import classification_report
performance_metrics = classification_report(y_test, y_pred, output_dict=True)

# Convert performance metrics to a DataFrame for better visualization
import pandas as pd
performance_df = pd.DataFrame(performance_metrics).transpose()

# Print all the output performance metrics
print(performance_df)

              precision    recall  f1-score     support
0              0.946429  0.946429  0.946429   56.000000
1              0.985000  0.985000  0.985000  200.000000
accuracy       0.976562  0.976562  0.976562    0.976562
macro avg      0.965714  0.965714  0.965714  256.000000
weighted avg   0.976562  0.976562  0.976562  256.000000
