In [None]:
import pandas as pd 
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Dog Movement Analysis
Obtained from: https://www.sciencedirect.com/science/article/pii/S2352340922000348

## Load, Clean, and Split Dog Data

In [None]:
# Read the raw dog movement recordings; the path is relative to the working directory.
dog_raw = pd.read_csv("DogMoveData.csv")

In [None]:
# Bucket the raw timestamps: rounded seconds, and the minute each rounded second falls in.
# Both columns are written back onto dog_raw itself.
rounded_seconds = dog_raw['t_sec'].round()
dog_raw['second'] = rounded_seconds
dog_raw['minute'] = rounded_seconds // 60

# Grouping keys plus the twelve sensor channels (4 sensors x 3 axes).
group_keys = ['DogID', 'TestNum', 'minute', 'Task']
sensor_cols = [
    f"{sensor}_{axis}"
    for sensor in ('ABack', 'ANeck', 'GBack', 'GNeck')
    for axis in ('x', 'y', 'z')
]

# Summarise every channel per dog / test / minute / task window.
dog_clean = (
    dog_raw[group_keys + sensor_cols]
    .groupby(group_keys)
    .agg(['mean', 'std', 'min', 'max'])
)
# Flatten the (channel, statistic) column pairs into names such as 'ABack_x_mean'.
dog_clean.columns = [f"{channel}_{stat}" for channel, stat in dog_clean.columns]
dog_clean = dog_clean.reset_index()

# Discard unlabelled windows and any window with a missing statistic.
dog_clean = dog_clean[dog_clean['Task'].ne('<undefined>')].dropna()
dog_clean['Task'] = dog_clean['Task'].astype('category')

In [None]:
# Hold out 20% of the aggregated windows for testing. The identifier/time columns
# and the label itself are excluded from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    dog_clean.drop(columns=['Task', 'DogID', 'TestNum', 'minute']),
    dog_clean.Task,
    test_size=0.2,
    random_state=42,
)

x_vars = X_train.columns

# Fit the scaler on the training split only, then apply the same scaling to the test split.
standardize = StandardScaler()
X_train = standardize.fit_transform(X_train)
X_test = standardize.transform(X_test)

# StandardScaler returns plain arrays; restore the column names.
# Note that these frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(X_train, columns=x_vars)
X_test = pd.DataFrame(X_test, columns=x_vars)

train = X_train.copy()
# y_train still carries the shuffled dog_clean index, whereas train is indexed 0..n-1.
# Assigning the Series directly would align on index and attach wrong (or NaN) labels,
# so drop the old index to pair rows positionally. The categorical dtype is preserved.
train['Task'] = y_train.reset_index(drop=True)

print("training data shape", train.shape)
print("test samples", y_test.shape)

## Dog Data Visualizations

In [None]:
# Horizontal bar chart of the number of training rows per task class.
task_counts = train.value_counts('Task')
task_counts.plot.barh()
plt.title("Distribution of Tasks")
plt.xlabel("Count")

The fairly even distribution of classes suggests that no class-imbalance techniques need to be employed.

In [None]:
# Reshape to one row per (sample, feature) and split feature names such as
# 'ABack_x_mean' into their three underscore-separated parts.
train_long = train.melt(id_vars='Task')
name_parts = train_long['variable'].str.split('_', expand=True)
train_long[['bodypart', 'direction', 'metric']] = name_parts

# The first character of the sensor prefix is the device code ('A' or 'G');
# the remainder is the body part.
sensor_prefix = train_long['bodypart']
train_long['device'] = sensor_prefix.str[0]
train_long['bodypart'] = sensor_prefix.str[1:]

In [None]:
# KDE of the standardized per-window mean acceleration, one curve per task,
# faceted by direction (rows) and body part (columns).
accel_mean_rows = train_long.device.eq("A") & train_long.metric.eq('mean')
accel_mean = train_long.loc[accel_mean_rows, :]
accel_grid = sns.FacetGrid(
    accel_mean,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
accel_grid.map_dataframe(sns.kdeplot, x="value")
accel_grid.add_legend()
accel_grid.set(xlim=(-2, 2))
accel_grid.fig.suptitle("Mean Acceleration", y=1.05)

In [None]:
# KDE of the standardized per-window standard deviation of acceleration,
# one curve per task, faceted by direction (rows) and body part (columns).
accel_std_rows = train_long.device.eq("A") & train_long.metric.eq('std')
accel_std = train_long.loc[accel_std_rows, :]
accel_grid = sns.FacetGrid(
    accel_std,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
accel_grid.map_dataframe(sns.kdeplot, x="value")
accel_grid.add_legend()
accel_grid.set(xlim=(-2, 2))
accel_grid.fig.suptitle("Standard Deviation of Acceleration", y=1.05)

In [None]:
# KDE of the standardized per-window maximum acceleration, one curve per task,
# faceted by direction (rows) and body part (columns).
accel_max_rows = train_long.device.eq("A") & train_long.metric.eq('max')
accel_max = train_long.loc[accel_max_rows, :]
accel_grid = sns.FacetGrid(
    accel_max,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
accel_grid.map_dataframe(sns.kdeplot, x="value")
accel_grid.add_legend()
accel_grid.set(xlim=(-2, 2))
accel_grid.fig.suptitle("Max Acceleration", y=1.05)

In [None]:
# KDE of the standardized per-window mean gyroscope reading. The lie-down, sit
# and stand tasks are left out of this plot.
excluded_tasks = ["Task lie down", "Task sit", "Task stand"]
gyro_mean_rows = (
    train_long.device.eq("G")
    & train_long.metric.eq('mean')
    & ~train_long.Task.isin(excluded_tasks)
)
gyro_mean = train_long.loc[gyro_mean_rows, :]
gyro_grid = sns.FacetGrid(
    gyro_mean,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
gyro_grid.map_dataframe(sns.kdeplot, x="value")
gyro_grid.add_legend()
gyro_grid.set(xlim=(-20, 20))
gyro_grid.fig.suptitle("Mean Gyroscope", y=1.05)

In [None]:
# KDE of the standardized per-window standard deviation of the gyroscope reading.
# The lie-down, sit and stand tasks are left out of this plot.
excluded_tasks = ["Task lie down", "Task sit", "Task stand"]
gyro_std_rows = (
    train_long.device.eq("G")
    & train_long.metric.eq('std')
    & ~train_long.Task.isin(excluded_tasks)
)
gyro_std = train_long.loc[gyro_std_rows, :]
gyro_grid = sns.FacetGrid(
    gyro_std,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
gyro_grid.map_dataframe(sns.kdeplot, x="value")
gyro_grid.add_legend()
gyro_grid.set(xlim=(-20, 20))
gyro_grid.fig.suptitle("Standard Deviation Gyroscope", y=1.05)

In [None]:
# KDE of the standardized per-window maximum gyroscope reading.
# The lie-down, sit and stand tasks are left out of this plot.
excluded_tasks = ["Task lie down", "Task sit", "Task stand"]
gyro_max_rows = (
    train_long.device.eq("G")
    & train_long.metric.eq('max')
    & ~train_long.Task.isin(excluded_tasks)
)
gyro_max = train_long.loc[gyro_max_rows, :]
gyro_grid = sns.FacetGrid(
    gyro_max,
    row='direction',
    col="bodypart",
    hue="Task",
    sharex=False,
    sharey=False,
)
gyro_grid.map_dataframe(sns.kdeplot, x="value")
gyro_grid.add_legend()
gyro_grid.set(xlim=(-20, 20))
gyro_grid.fig.suptitle("Max Gyroscope", y=1.05)

In [None]:
# Back sensor: standardized mean Y vs. mean Z acceleration, coloured by the task label.
back_yz_ax = sns.scatterplot(
    data=train,
    x='ABack_y_mean',
    y='ABack_z_mean',
    hue='Task',
)
back_yz_ax.set_title("True Data Points Between Mean Z\nAcceleration vs Mean Y Acceleration on Back")
back_yz_ax.set_xlabel("Mean Y Acceleration")
back_yz_ax.set_ylabel("Mean Z Acceleration")

In [None]:
# Back sensor: standardized mean X vs. mean Y acceleration, coloured by the task label.
# A title and readable axis labels are set so the figure stands on its own.
sns.scatterplot(data=train, x='ABack_x_mean', y='ABack_y_mean', hue='Task')
plt.title("True Data Points Between Mean Y\nAcceleration vs Mean X Acceleration on Back")
plt.xlabel("Mean X Acceleration")
plt.ylabel("Mean Y Acceleration")

In [None]:
# Neck sensor: standardized mean X vs. mean Y acceleration, coloured by the task label.
# A title and readable axis labels are set so the figure stands on its own.
sns.scatterplot(data=train, x='ANeck_x_mean', y='ANeck_y_mean', hue='Task')
plt.title("True Data Points Between Mean Y\nAcceleration vs Mean X Acceleration on Neck")
plt.xlabel("Mean X Acceleration")
plt.ylabel("Mean Y Acceleration")

In [None]:
# Neck sensor: standardized mean Y vs. mean Z acceleration, coloured by the task label.
# A title and readable axis labels are set so the figure stands on its own.
sns.scatterplot(data=train, x='ANeck_y_mean', y='ANeck_z_mean', hue='Task')
plt.title("True Data Points Between Mean Z\nAcceleration vs Mean Y Acceleration on Neck")
plt.xlabel("Mean Y Acceleration")
plt.ylabel("Mean Z Acceleration")

In [None]:
# Cross-sensor view: standardized mean Y acceleration on the back vs. mean Z
# acceleration on the neck, coloured by the task label.
# A title and readable axis labels are set so the figure stands on its own.
sns.scatterplot(data=train, x='ABack_y_mean', y='ANeck_z_mean', hue='Task')
plt.title("True Data Points Between Mean Z Neck\nAcceleration vs Mean Y Back Acceleration")
plt.xlabel("Mean Y Acceleration (Back)")
plt.ylabel("Mean Z Acceleration (Neck)")

In most plots it is hard to distinguish any separation among the classes; however, some variables and plots show more separation than others. In general, the y and z measurements show more separation.

## SVC (One vs One) on Dog Data

In [None]:
# 10-fold grid search over the SVC regularisation strength and kernel type,
# then score the refit best model on the held-out test split.
dog_svc_grid = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'poly'],
)
dog_svc_cv = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=dog_svc_grid,
    cv=10,
    n_jobs=-1,
    verbose=True,
)
dog_svc_cv.fit(X_train, y_train)
svc_test_accuracy = dog_svc_cv.score(X_test, y_test)
print("Test Accuracy:", svc_test_accuracy)

In [None]:
# Rank the SVC grid-search candidates by mean cross-validated score, best first.
svc_report_cols = [
    'param_C', 'param_kernel',
    'mean_test_score', 'std_test_score',
    'mean_fit_time', 'std_fit_time',
]
svc_cv_table = pd.DataFrame(dog_svc_cv.cv_results_)
svc_cv_table[svc_report_cols].sort_values('mean_test_score', ascending=False)

## Linear SVC (One vs Rest) on Dog Data

In [None]:
# Grid search over the LinearSVC regularisation strength, then score the refit
# best model on the held-out test split.
dog_linear_svc_grid = {
    'C' : [0.1, 1, 10, 100]
}
# cv=10 is set explicitly so this search uses the same 10-fold protocol as the SVC
# and logistic-regression searches; leaving it out falls back to the 5-fold default,
# which makes the cross-validation scores of the three models not directly comparable.
dog_linearsvc_cv = GridSearchCV(svm.LinearSVC(), dog_linear_svc_grid, cv=10, n_jobs=-1, verbose=True)
dog_linearsvc_cv.fit(X_train, y_train)
print("Test Accuracy:", dog_linearsvc_cv.score(X_test, y_test))

In [None]:
# Rank the LinearSVC grid-search candidates by mean cross-validated score, best first.
linearsvc_report_cols = [
    'param_C',
    'mean_test_score', 'std_test_score',
    'mean_fit_time', 'std_fit_time',
]
linearsvc_cv_table = pd.DataFrame(dog_linearsvc_cv.cv_results_)
linearsvc_cv_table[linearsvc_report_cols].sort_values('mean_test_score', ascending=False)

## Logistic Regression (One vs Rest) on Dog Data

In [None]:
# 10-fold grid search over the logistic-regression inverse regularisation strength,
# then score the refit best model on the held-out test split.
dog_log_grid = dict(C=[0.01, 0.1, 1, 10])
dog_log_cv = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=dog_log_grid,
    cv=10,
    n_jobs=-1,
    verbose=True,
)
dog_log_cv.fit(X_train, y_train)
log_test_accuracy = dog_log_cv.score(X_test, y_test)
print("Test Accuracy:", log_test_accuracy)

In [None]:
# Rank the logistic-regression grid-search candidates by mean cross-validated score, best first.
log_report_cols = [
    'param_C',
    'mean_test_score', 'std_test_score',
    'mean_fit_time', 'std_fit_time',
]
log_cv_table = pd.DataFrame(dog_log_cv.cv_results_)
log_cv_table[log_report_cols].sort_values('mean_test_score', ascending=False)

## Confusion Matrix and Metrics on Dog Data

In [None]:
# Per-class precision / recall / F1 / support of the tuned logistic regression on the test split.
y_pred = dog_log_cv.predict(X_test)
y_labels = [
    'Task treat-search',
    'Task play',
    'Task stand',
    'Task trot',
    'Task sit',
    'Task lie down',
    'Task walk',
]
# Passing labels= fixes the row order of the returned arrays to match y_labels.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=y_labels)
results = pd.DataFrame(
    {
        'Task': y_labels,
        'Precision': p,
        'Recall': r,
        "F1": f,
        "Support": s,
    }
)
results

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
# The label order is pinned to the categorical's categories so the matrix rows and
# columns always match the tick labels, even if some category happens to be absent
# from both y_test and y_pred (without labels=, sklearn would silently drop it and
# the ticks would no longer line up with the cells).
task_names = list(y_test.cat.categories)
mat = confusion_matrix(y_test, y_pred, labels=task_names)
# confusion_matrix puts true labels on rows; transpose so that true labels run along x.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=task_names, yticklabels=task_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.title("Confusion Matrix Of Test Set")

All three models show greater accuracy on the test set than in the 10-fold cross-validation. The SVC (OVO) shows the greatest accuracy but also the greatest variability in model performance. The Linear SVC (OVR) shows the lowest accuracy but also the least variance, suggesting high bias and low variance.

The RBF kernels outperform almost all of the polynomial kernels. Only degree 3 polynomials were considered however. 

The Logistic Regression model took the longest to run on average, about twice as long as the SVC (OVO) model. However, with larger sample sizes (such as the raw time series data itself), the SVC (OVO) model was infeasible to run at all. The complexity of that model grows much faster than that of the others due to the OVO structure.

The confusion matrix and the per-class precision and recall show good results. The most commonly confused activities are the less active ones, such as lying down, sitting, and standing, where little movement is recorded.

## Plot Decision Boundaries on Dog Data Variables

In [None]:
# Sample 10,000 random points uniformly in [-10, 10) across every (standardized)
# feature and label each with the tuned LinearSVC, to visualise its decision regions.
# A seeded generator is used so the sampled points, and therefore the plots below,
# are identical on every Restart & Run All.
grid_rng = np.random.default_rng(42)
grid_predictions = pd.DataFrame(
    grid_rng.uniform(-10, 10, (10000, X_train.shape[1])),
    columns=X_train.columns,
)
# predict() is evaluated before 'Task' is attached, so it sees only the feature columns.
grid_predictions['Task'] = dog_linearsvc_cv.predict(grid_predictions)

In [None]:
# Predicted task for the random points, projected onto the back sensor's
# mean Y / mean Z acceleration plane.
pred_back_yz_ax = sns.scatterplot(
    data=grid_predictions,
    x='ABack_y_mean',
    y='ABack_z_mean',
    hue='Task',
)
pred_back_yz_ax.set_title("Predicted Points Between Mean Z\nAcceleration vs. Mean Y Acceleration On Back")
pred_back_yz_ax.set_xlabel("Mean Y Acceleration")
pred_back_yz_ax.set_ylabel("Mean Z Acceleration")
# Only the lower part of the sampled Z range is shown.
pred_back_yz_ax.set_ylim(-10.5, 1)

This plot shows some decision boundaries or clustering of classes. Sitting (green) is certainly more on the left side.

In [None]:
# Predicted task for the random points, projected onto the back sensor's
# mean X / mean Y acceleration plane. Title and axis labels are set so the
# figure stands on its own.
sns.scatterplot(data=grid_predictions, x='ABack_x_mean', y='ABack_y_mean', hue='Task')
plt.title("Predicted Points Between Mean Y\nAcceleration vs. Mean X Acceleration On Back")
plt.xlabel("Mean X Acceleration")
plt.ylabel("Mean Y Acceleration")

In [None]:
# Predicted task for the random points, projected onto the neck sensor's
# mean X / mean Y acceleration plane. Title and axis labels are set so the
# figure stands on its own.
sns.scatterplot(data=grid_predictions, x='ANeck_x_mean', y='ANeck_y_mean', hue='Task')
plt.title("Predicted Points Between Mean Y\nAcceleration vs. Mean X Acceleration On Neck")
plt.xlabel("Mean X Acceleration")
plt.ylabel("Mean Y Acceleration")

In [None]:
# Predicted task for the random points, projected onto the neck sensor's
# mean Y / mean Z acceleration plane. Title and axis labels are set so the
# figure stands on its own.
sns.scatterplot(data=grid_predictions, x='ANeck_y_mean', y='ANeck_z_mean', hue='Task')
plt.title("Predicted Points Between Mean Z\nAcceleration vs. Mean Y Acceleration On Neck")
plt.xlabel("Mean Y Acceleration")
plt.ylabel("Mean Z Acceleration")

In [None]:
# Predicted task for the random points: mean Y acceleration on the back vs.
# mean Z acceleration on the neck. Title and axis labels are set so the
# figure stands on its own.
sns.scatterplot(data=grid_predictions, x='ABack_y_mean', y='ANeck_z_mean', hue='Task')
plt.title("Predicted Points Between Mean Z Neck\nAcceleration vs. Mean Y Back Acceleration")
plt.xlabel("Mean Y Acceleration (Back)")
plt.ylabel("Mean Z Acceleration (Neck)")

Heart Attack Analysis & Prediction Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Load the heart dataset and preview the first rows.
df = pd.read_csv('heart.csv')
print(df.head())

# Calculate correlation matrix
correlation_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

# Define explanatory variables and target variable
X = df[['cp', 'thalachh']]
y = df['output']

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features for SVM (the scaler is fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
model = SVC(kernel='linear', class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Support Vector Machine Classifier Results")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot decision boundary for 'cp' and 'thalachh'
plt.figure(figsize=(10, 6))
sns.scatterplot(x='cp', y='thalachh', hue='output', data=df, palette='coolwarm')

# Generate a grid (in the original, unscaled units) to plot the decision boundary
xx, yy = np.meshgrid(np.linspace(X['cp'].min(), X['cp'].max(), 100),
                     np.linspace(X['thalachh'].min(), X['thalachh'].max(), 100))
# The scaler was fitted on a DataFrame, so it remembers the feature names. Wrapping
# the grid in a DataFrame with the same columns avoids sklearn's
# "X does not have valid feature names" warning and makes the column order explicit.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)
Z = model.predict(scaler.transform(grid_points))
Z = Z.reshape(xx.shape)

# Plot decision boundary
plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
plt.title("Decision Boundary for 'cp' and 'thalachh'")
plt.xlabel('cp')
plt.ylabel('thalachh')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Reload the heart dataset and preview it.
df = pd.read_csv('heart.csv')
print("Data Preview:")
print(df.head())

# Heatmap of the pairwise correlations between all columns.
correlation_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

# Four explanatory variables this time; 'output' is the binary target.
selected_features = ['cp', 'thalachh', 'exng', 'oldpeak']
X = df[selected_features]
y = df['output']

# Pairwise view of the selected features, coloured by the target.
sns.pairplot(df, vars=selected_features, hue='output', palette='coolwarm')
plt.suptitle("Pair Plot of Selected Features", y=1.02)
plt.show()

# 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize using statistics from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Linear-kernel SVM with class weights balanced by class frequency.
model = SVC(kernel='linear', class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the headline metrics, then the full per-class report.
print("Support Vector Machine Classifier Results")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the Titanic dataset into `data`; later cells read 'Survived', 'Sex', 'Pclass',
# 'Age', 'Embarked', 'Name', 'Ticket' and 'Cabin' from it.
# The path is relative to the working directory.
data = pd.read_csv('titanic.csv')  # Ensure the file path is correct

In [None]:
# Quick look at the data: the first rows, then the per-column count of missing values.
print(data.head())
print(data.isna().sum())

# Survival counts overall, then broken down by sex and by passenger class.
for hue_column, plot_title in (
    (None, "Survival Counts"),
    ('Sex', "Survival Counts by Gender"),
    ('Pclass', "Survival Counts by Class"),
):
    sns.countplot(x='Survived', hue=hue_column, data=data)
    plt.title(plot_title)
    plt.show()

In [None]:
# Drop unnecessary columns. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
data = data.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

# Fill missing values in 'Age' and 'Embarked'.
# Assign the result back instead of calling fillna(..., inplace=True) on data[col]:
# the in-place form operates on an intermediate Series, emits a FutureWarning on
# recent pandas, and does not modify `data` at all under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Convert 'Sex' and 'Embarked' to numeric values (integer codes per distinct value).
# The encoder is re-fitted for each column, so after this cell it only holds the
# 'Embarked' mapping.
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

# Define features and target variable
# NOTE(review): every remaining column except 'Survived' becomes a feature. If the CSV
# carries an identifier column such as 'PassengerId', it is fed to the models too —
# confirm whether that is intended.
X = data.drop('Survived', axis=1)  # Explanatory variables
y = data['Survived']                # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a logistic-regression baseline on the standardized training features.
logistic_model = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Score the held-out split: per-class report followed by the raw confusion matrix.
y_pred_logistic = logistic_model.predict(X_test_scaled)
logistic_report = classification_report(y_test, y_pred_logistic)
logistic_confusion = confusion_matrix(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(logistic_report)
print("Confusion Matrix:\n", logistic_confusion)

In [None]:
# Train an RBF-kernel SVM with class weights balanced by class frequency.
svm_model_rbf = SVC(kernel='rbf', class_weight='balanced').fit(X_train_scaled, y_train)

# Score the held-out split: per-class report followed by the raw confusion matrix.
y_pred_svm_rbf = svm_model_rbf.predict(X_test_scaled)
svm_rbf_report = classification_report(y_test, y_pred_svm_rbf)
svm_rbf_confusion = confusion_matrix(y_test, y_pred_svm_rbf)
print("SVM with RBF Kernel Classification Report:")
print(svm_rbf_report)
print("Confusion Matrix:\n", svm_rbf_confusion)

In [None]:
# 5-fold cross-validated accuracy on the training split for both models.
# cross_val_score works on clones, so the already-fitted estimators are left untouched.
logistic_cv_scores = cross_val_score(
    logistic_model, X_train_scaled, y_train, cv=5
)
logistic_cv_mean = logistic_cv_scores.mean()
print("Logistic Regression CV Accuracy:", logistic_cv_mean)

svm_cv_scores = cross_val_score(
    svm_model_rbf, X_train_scaled, y_train, cv=5
)
svm_cv_mean = svm_cv_scores.mean()
print("SVM with RBF CV Accuracy:", svm_cv_mean)

In [None]:
# RBF-kernel SVM with manual class weights: errors on class 1 cost twice as much as on class 0.
manual_class_weights = {0: 1, 1: 2}  # Example of class weight adjustment
svm_model_weighted = SVC(kernel='rbf', class_weight=manual_class_weights)
svm_model_weighted.fit(X_train_scaled, y_train)

# Score the held-out split: per-class report followed by the raw confusion matrix.
y_pred_weighted = svm_model_weighted.predict(X_test_scaled)
weighted_report = classification_report(y_test, y_pred_weighted)
weighted_confusion = confusion_matrix(y_test, y_pred_weighted)
print("SVM with Weighted Classes Classification Report:")
print(weighted_report)
print("Confusion Matrix:\n", weighted_confusion)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-vs-rest wrapper around logistic regression; report its test accuracy.
ovr_model = OneVsRestClassifier(LogisticRegression(max_iter=200)).fit(X_train_scaled, y_train)
y_pred_ovr = ovr_model.predict(X_test_scaled)
ovr_accuracy = accuracy_score(y_test, y_pred_ovr)
print(f"OVR Logistic Regression Accuracy: {ovr_accuracy:.4f}")

# RBF-kernel SVC with default settings (the "OVO" model in this notebook);
# report its test accuracy.
ovo_model = SVC(kernel='rbf').fit(X_train_scaled, y_train)
y_pred_ovo = ovo_model.predict(X_test_scaled)
ovo_accuracy = accuracy_score(y_test, y_pred_ovo)
print(f"OVO SVM Accuracy: {ovo_accuracy:.4f}")