In [8]:
#Data Science Capstone
#Data Import and Preparation:
import pandas as pd

# Import the data
data = pd.read_csv('health care diabetes.csv')  # Replace 'your_dataset.csv' with the actual file path

# Identify the primary key
primary_key = 'your_primary_key_column_name'  # Replace 'your_primary_key_column_name' with the actual column name

# Assess the fill rate of variables and handle missing values
fill_rates = data.isnull().mean()  # Calculate the proportion of missing values for each variable
missing_variables = fill_rates[fill_rates > 0].index.tolist()  # Get variables with missing values

# Handle missing values (example: fill missing values with mean/median)
for variable in missing_variables:
    if data[variable].dtype == 'object':
        data[variable].fillna(data[variable].mode()[0], inplace=True)  # Fill missing categorical values with mode
    else:
        data[variable].fillna(data[variable].median(), inplace=True)  # Fill missing numerical values with median

# Print the fill rate of variables
print("Fill rates:")
print(fill_rates)

# Print the updated dataset with filled missing values
print("Updated dataset:")
print(data.head())



Fill rates:
Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64
Updated dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4              

In [9]:
import pandas as pd

# Import the data
data = pd.read_csv('health care diabetes.csv')

# Identify the primary key
primary_key = None

# Check for unique identifier columns
for column in data.columns:
    if data[column].nunique() == len(data):
        primary_key = column
        break

if primary_key is not None:
    print("Primary key identified:", primary_key)
else:
    print("No primary key identified.")

# Assess the need for indexing
indexing_required = []

# Check for columns that may benefit from indexing
for column in data.columns:
    unique_values_ratio = data[column].nunique() / len(data)
    if unique_values_ratio > 0.5:
        indexing_required.append(column)

if len(indexing_required) > 0:
    print("Columns that may require indexing:", indexing_required)
else:
    print("No columns require indexing.")


No primary key identified.
Columns that may require indexing: ['DiabetesPedigreeFunction']


In [18]:
#Gauge the fill rate of the variables and devise plans for missing value treatment. Please explain explicitly the reason for the treatment chosen for each variable.

import pandas as pd

# Import the data
data = pd.read_csv('health care diabetes.csv')

# Calculate the fill rate of variables
fill_rate = data.count() / len(data) * 100

# Print fill rate for each variable
print("Fill rate of variables:")
print(fill_rate)

# Devise plans for missing value treatment
missing_value_treatment = {}

# Analyze the fill rate and choose appropriate treatment for each variable
for column in data.columns:
    if fill_rate[column] >= 90:
        # If fill rate is high (>=90%), we can consider dropping the column
        missing_value_treatment[column] = "Drop"
    elif fill_rate[column] >= 50:
        # If fill rate is moderate (>=50%), we can consider imputing missing values with mean or median
        missing_value_treatment[column] = "Impute with mean/median"
    else:
        # If fill rate is low (<50%), we may need to analyze further or drop the column
        missing_value_treatment[column] = "Further analysis or drop"

# Print missing value treatment for each variable
print("\nMissing value treatment:")
for column, treatment in missing_value_treatment.items():
    print(f"{column}: {treatment}")


Fill rate of variables:
Pregnancies                 100.0
Glucose                     100.0
BloodPressure               100.0
SkinThickness               100.0
Insulin                     100.0
BMI                         100.0
DiabetesPedigreeFunction    100.0
Age                         100.0
Outcome                     100.0
dtype: float64

Missing value treatment:
Pregnancies: Drop
Glucose: Drop
BloodPressure: Drop
SkinThickness: Drop
Insulin: Drop
BMI: Drop
DiabetesPedigreeFunction: Drop
Age: Drop
Outcome: Drop


In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('health care diabetes.csv')

# Separate the features (X) and the target variable (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)


Accuracy: 0.7142857142857143


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data from the "health care diabetes.csv" file
data = pd.read_csv('health care diabetes.csv')

# Filter locations where percent ownership is above 10% and percent of households with a second mortgage is below 50%
filtered_data = data[(data['PercentOwnership'] > 10) & (data['SecondMortgage'] < 50)]

# Sort the data by percentage of households with a second mortgage in descending order
sorted_data = filtered_data.sort_values('SecondMortgage', ascending=False).head(2500)

# Visualize the top 2,500 locations on a geo-map
# Use latitude and longitude columns to plot the locations on a map
plt.figure(figsize=(10, 8))
sns.scatterplot(data=sorted_data, x='Longitude', y='Latitude', hue='SecondMortgage', size='PercentOwnership', alpha=0.8)
plt.title('Locations with High Second Mortgage Percentage')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

# Calculate bad debt
sorted_data['BadDebt'] = sorted_data['SecondMortgage'] + sorted_data['HomeEquity'] - sorted_data['HomeEquitySecondMortgage']

# Pie chart for overall debt
plt.figure(figsize=(8, 6))
sorted_data[['SecondMortgage', 'HomeEquity', 'BadDebt']].sum().plot(kind='pie', autopct='%1.1f%%', labels=None)
plt.title('Overall Debt')
plt.legend(labels=['Second Mortgage', 'Home Equity', 'Bad Debt'])
plt.show()

# Pie chart for bad debt
plt.figure(figsize=(6, 6))
sorted_data[['BadDebt']].sum().plot(kind='pie', autopct='%1.1f%%', labels=None)
plt.title('Bad Debt')
plt.legend(labels=['Bad Debt'])
plt.show()

# Box and whisker plot for different debt types in different cities
plt.figure(figsize=(10, 8))
sns.boxplot(data=sorted_data, x='City', y=['SecondMortgage', 'HomeEquity', 'GoodDebt', 'BadDebt'])
plt.title('Debt Types in Different Cities')
plt.xlabel('City')
plt.ylabel('Debt Amount')
plt.show()

# Collated income distribution chart
plt.figure(figsize=(10, 8))
sns.kdeplot(data=sorted_data, x='FamilyIncome', shade=True)
sns.kdeplot(data=sorted_data, x='HouseholdIncome', shade=True)
sns.kdeplot(data=sorted_data, x='RemainingIncome', shade=True)
plt.title('Income Distribution')
plt.xlabel('Income')
plt.ylabel('Density')
plt.legend(labels=['Family Income', 'Household Income', 'Remaining Income'])
plt.show()


KeyError: 'PercentOwnership'