In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the heart-disease CSV into the pandas DataFrame used by every later cell.
heart_csv_path = "../input/heart.csv"
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Number of rows per value of the `target` column.
df["target"].value_counts()

In [None]:
# Bar chart of the row count for each `target` value.
sns.countplot(data=df, x="target")
plt.show()

In [None]:
# Percentage of rows with target == 0 (no disease) and target == 1 (disease).
total_patients = len(df.target)
NoDisease = len(df.loc[df["target"] == 0])
HaveDisease = len(df.loc[df["target"] == 1])

for message, count in (
    ("%age of Patient have not heart disease: {:.2f}%", NoDisease),
    ("%age of Patient have heart disease: {:.2f}%", HaveDisease),
):
    print(message.format((count / total_patients) * 100))

In [None]:
# Bar chart of the row count for each value of the `sex` column.
sns.countplot(data=df, x="sex")
plt.xlabel("Sex (0 = female, 1 = male)")
plt.show()

In [None]:
# Percentage of rows with sex == 0 (female) and sex == 1 (male).
total_rows = len(df.sex)
Female = len(df.loc[df["sex"] == 0])
Male = len(df.loc[df["sex"] == 1])

for message, count in (
    ("%age of female: {:.2f}%", Female),
    ("%age of male: {:.2f}%", Male),
):
    print(message.format((count / total_rows) * 100))

In [None]:
# Mean of every column, computed separately for each `target` value.
df.groupby(by="target").mean()

In [None]:
# Count rows per (age, target) pair and draw them as grouped bars.
age_by_target = pd.crosstab(df["age"], df["target"])
age_by_target.plot.bar(figsize=(20, 6))

plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Heart Disease Frequency for Ages')

# Write the figure to disk before showing it.
plt.savefig('heartDiseaseAndAges.png')
plt.show()

In [None]:
# Count rows per (sex, target) pair and draw them as grouped bars.
sex_by_target = pd.crosstab(df["sex"], df["target"])
sex_by_target.plot.bar(figsize=(15, 6))

plt.xlabel('Sex (0 = Female, 1 = Male)')
plt.ylabel('Frequency')
plt.title('Heart Disease Frequency for Sex')
# Legend entries follow the crosstab column order (target 0, then target 1).
plt.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Scatter of age against thalach, one series per target class.
with_disease = df.target == 1
without_disease = df.target == 0

# Diseased patients are drawn first (in red) so the legend order below matches.
plt.scatter(x=df.age[with_disease], y=df.thalach[with_disease], c="red")
plt.scatter(x=df.age[without_disease], y=df.thalach[without_disease])

plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations.
correlations = df.corr()
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(correlations, ax=ax, annot=True, fmt='.1f', linewidths=0.5)
plt.show()

In [None]:
!pip install pyspark

In [None]:
import pyspark

# getOrCreate() returns the SparkContext already running in this kernel, if any,
# so re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once". When a context already exists
# the configuration passed here is ignored.
spark_conf = pyspark.SparkConf().setAppName("Heart")
sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)

In [None]:
from pyspark.sql import SparkSession

# SQLContext has been deprecated since Spark 3.0; SparkSession is the supported
# entry point. The builder attaches to the SparkContext that is already running
# (`sc`) instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

# Keep the old name bound so code that still refers to `sqlContext` for
# createDataFrame / sql / table / read keeps working.
sqlContext = spark

# Convert the pandas DataFrame into a Spark DataFrame.
sdf = spark.createDataFrame(df)

In [None]:
# Print the first 20 rows of the Spark DataFrame.
sdf.show(n=20)

In [None]:
# Display the Spark DataFrame's column names and their data types.
sdf.dtypes

In [None]:
# Print the Spark DataFrame's schema.
sdf.printSchema()

In [None]:
# Number of rows per age, listed from the oldest age downwards.
age_counts = sdf.groupBy("age").count()
age_counts.orderBy("age", ascending=False).show()

In [None]:
# Estimator used in the model-fitting cell further down.
from pyspark.ml.classification import LogisticRegression
# Print the Spark DataFrame's schema.
sdf.printSchema()

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Predictor columns to pack into the single vector column Spark ML expects.
# 'thalach' was missing from the original list although it is a column of the
# dataset (it is plotted in the scatter chart above); it is included here so the
# model sees every non-target column.
feature_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Adds a 'features' vector column alongside the original columns.
output_data = assembler.transform(sdf)

In [None]:
# Print the schema of the assembled DataFrame.
output_data.printSchema()

In [None]:
# Keep only the assembled feature vector and the label column.
final_data = output_data.select('features', 'target')

# A fixed seed makes the 70/30 split, and therefore the fitted model and its
# metrics, repeatable across runs instead of changing on every execution.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

# Use a separate name for the estimator so `model` only ever refers to the
# fitted model, never to the unfitted LogisticRegression.
lr = LogisticRegression(featuresCol='features', labelCol='target')
model = lr.fit(train)

# Training summary: show descriptive statistics of the training predictions.
summary = model.summary
summary.predictions.describe().show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the fitted model on the held-out split; the result is a summary
# object whose `.predictions` attribute is the scored DataFrame.
predictions = model.evaluate(test)

# The original code printed BinaryClassificationEvaluator's output as
# "Accuracy", but that evaluator computes area under the ROC curve by default
# and was fed the hard 0/1 `prediction` column. Report the real accuracy from
# the summary instead.
acc = predictions.accuracy
print("Accuracy = ", acc*100)

# Area under the ROC curve, computed from the raw scores as the evaluator expects.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='target',
    metricName='areaUnderROC',
)
auc = evaluator.evaluate(predictions.predictions)
print("Area under ROC = ", auc)

In [None]:
# Shut down the SparkContext created earlier in the notebook.
sc.stop()