 # <center> Analyzing Fitness Survey Data </center>

#### In this notebook, I will be visualizing some of the key features of this fitness survey data. Let's start by reading in the data and cleaning it for further analysis.

In [None]:
from collections import Counter

import pandas as pd

In [None]:
# Location of the survey CSV, relative to the notebook's working directory.
DATA_PATH = "../input/fitness-analysis/fitness analysis.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
df.info()


#### It looks like the dataset doesn't have any missing values. However, the column names are too long, so let's shorten them by using abbreviations.

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Short replacement names, listed in the same order as the CSV's columns.
new_cols = [
    'Timestamp',
    'Name',
    'Gender',
    'Age',
    'Exercise_importance',
    'Fitness_level',
    'Regularity',
    'Barriers',
    'Exercises',
    'Do_you',
    'Time',
    'Time_spent',
    'Balanced_diet',
    'prevents_balanced',
    'Health_level',
    'Recommend_fitness',
    'Equipment',
    'Motivation',
]

In [None]:
# Lookup table: the index holds the current (long) column headers, the single
# data column holds the short name assigned to each of them.
column_reference = pd.DataFrame(data=new_cols, index=df.columns)
column_reference

#### We'll save the old column names as a dataframe for reference

In [None]:
# Replace the headers with the short names from new_cols; the assignment is
# positional, so new_cols must list one name per existing column, in order.
df.columns=new_cols

#### Since we do not need the timestamps and names of the participants for our analysis, let's drop these columns.

In [None]:
# Timestamp and respondent name are not used anywhere in the analysis.
# Rebinding df (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Timestamp', 'Name'], errors="ignore")

In [None]:
df.head()

### 1. Importance of Exercise for different age groups

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# One panel per age group, each showing the distribution of 'Exercise_importance'.
age_vals = df["Age"].unique()
# The panel order is chosen by position within unique(), i.e. it is tied to the
# order in which the age groups first appear in the data.
# NOTE(review): presumably this arranges the panels youngest -> oldest — confirm.
panel_order = age_vals[[1, 0, 4, 3, 2]]
grid = sns.FacetGrid(df, col='Age', col_order=panel_order)
# NOTE(review): seaborn documents distplot as deprecated — consider histplot/displot.
grid.map(sns.distplot, 'Exercise_importance')
grid.add_legend()
grid.set(xlim=(0, 5), ylim=(0, 1), xlabel="Importance of Exercise")
grid.despine()
plt.show()

### 2. Types of Exercises preferred by participants
#### Since participants could select multiple options for this question, let's create a dictionary to find the top preferred exercise.

In [None]:
# Each response stores the selected options joined by ";". Tally every option
# across all respondents. Counter replaces the hand-written "if key in dict"
# bookkeeping and keeps keys in first-seen order; the result is converted back
# to a plain dict so later cells see the same type as before.
exercises_list = dict(
    Counter(
        exercise
        for selected_options in df['Exercises']
        for exercise in selected_options.split(";")
    )
)

            
        

In [None]:
# Re-order the tallies from most to least selected; options with equal counts
# keep their relative order from exercises_list.
sorted_list = dict(
    sorted(exercises_list.items(), key=lambda item: item[1], reverse=True)
)
     

#### Now, let's convert the sorted dictionary into percentages and plot it on a bar graph.

In [None]:
# Express every tally as a percentage of all selections made.
count = sum(sorted_list.values())
sorted_list = {
    exercise: (selections / count) * 100
    for exercise, selections in sorted_list.items()
}

In [None]:
sorted_list

In [None]:
# Bar chart of the percentage share of each exercise option.
fig, ax = plt.subplots()
ax.bar(list(sorted_list.keys()), list(sorted_list.values()))
# Option names are long, so the tick labels are drawn vertically.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Exercise preferred by Participants")
ax.set_ylabel("Percentage")
ax.set_ylim(0, 100)
plt.show()

### 3. At what time do people prefer to exercise?

In [None]:
# Share of respondents (in %) per preferred time of day; value_counts sorts the
# shares from largest to smallest.
times = df["Time"].value_counts(normalize=True) * 100
# One offset per slice, sized from the data instead of a fixed 3-tuple, so the
# chart still renders when the number of distinct answers is not three. The last
# (smallest) slice is pulled out a little further, which reproduces the previous
# (0.05, 0.05, 0.1) for three answers.
explode = [0.05] * len(times)
if explode:
    explode[-1] = 0.1
plt.pie(times, labels=times.index, explode=explode, shadow=True, autopct='%.1f%%', startangle=90)
plt.title("Preferred Time to exercise")
plt.show()

### 4. Average time spent per day for Exercise?

In [None]:
df["Time_spent"].unique()

#### To find out approximately the average time spent by an individual on exercise daily, let's convert the 'Time_spent' column into numerical values (minutes).

In [None]:
# Split every answer at the first space only: column 0 receives the leading
# token, column 1 the remainder of the text.
times = df["Time_spent"].str.split(pat=" ", n=1, expand=True)

In [None]:
# Seed the new column with the leading token of each answer (still text here).
df["Time_spent_minutes"] = times.iloc[:, 0]

In [None]:
def convertor(val):
    """Turn the leading token of a 'Time_spent' answer into an integer.

    The token "I" maps to 0; every other token is handed to int(), which
    raises ValueError for text that is not a whole number.
    """
    return 0 if val == "I" else int(val)

In [None]:
# Parse the leading tokens into integers, element by element, via convertor.
leading_tokens = df["Time_spent_minutes"]
df["Time_spent_minutes"] = leading_tokens.map(convertor)

In [None]:
# Rebuild the column from the raw "Time_spent" answers rather than from its own
# previous value: scaling the column in place multiplied by 60 again on every
# re-run of this cell.
leading_value = df["Time_spent"].str.split(" ", n=1).str[0].apply(convertor)
# A leading value of 30 is kept as-is; every other value is multiplied by 60.
# NOTE(review): presumably 30 is the only answer already given in minutes and the
# rest are hours — confirm against df["Time_spent"].unique().
df["Time_spent_minutes"] = leading_value.apply(lambda x: x * 60 if x != 30 else x)

In [None]:
df["Time_spent_minutes"].value_counts()

In [None]:
# Select the numeric column *before* aggregating. Calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError
# on pandas >= 2.0. Computing the result once also avoids a second groupby.
mean_minutes_by_gender = df.groupby("Gender")["Time_spent_minutes"].mean()
mean_minutes_by_gender.plot.barh()
# Annotate each bar with its value at the bar's end.
for position, minutes in enumerate(mean_minutes_by_gender):
    plt.text(minutes, position, (str(round(minutes, 2)) + " mins"))
plt.title("Average time spent daily on Exercise by Gender")
plt.xlabel("Time (Minutes)")
plt.xlim(0,70)
plt.show()

In [None]:
# Select the numeric column before aggregating: .mean() on the whole grouped
# frame raises TypeError for the text columns on pandas >= 2.0.
groups = df.groupby("Age")["Time_spent_minutes"].mean()
overall_mean = df["Time_spent_minutes"].mean()
# x/y are passed by keyword; seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=groups.index, y=groups)
# The label is derived from the data so it cannot drift away from the plotted
# line; it is placed slightly above that line.
ax.text(0, overall_mean + 2, "Total Average={:.1f} minutes".format(overall_mean), color="purple")
plt.title("Average time spent on Exercise daily by Age")
plt.ylim(10,60)
plt.axhline(overall_mean, color="purple")
plt.show()

### 5. Top motivation factors for daily exercise?

In [None]:
# Each response stores the selected motivations joined by ";". Tally every
# option across all respondents. Counter replaces the hand-written
# "if key in dict" bookkeeping and keeps keys in first-seen order; the result is
# converted back to a plain dict so later cells see the same type as before.
motivation_list = dict(
    Counter(
        motivation
        for selected_options in df['Motivation']
        for motivation in selected_options.split(";")
    )
)

In [None]:
motivation_list

In [None]:
# Two-column frame (0 = motivation text, 1 = number of selections), ranked by
# count; only the five most selected rows are kept.
top_5_motivation = (
    pd.DataFrame(list(motivation_list.items()))
    .sort_values(by=1, ascending=False)
    .head(5)
)

In [None]:
# Remove the literal "I want to " prefix from the motivation text.
top_5_motivation[0] = top_5_motivation[0].str.replace("I want to ", "", regex=False)

In [None]:
# Bar plot of the five most selected motivations (column 0) against their counts (column 1).
ax = sns.barplot(x=0, y=1, data=top_5_motivation)
ax.tick_params(axis="x", labelrotation=45)
ax.set_ylabel("Number of responses")
ax.set_title("Top five reasons to exercise")
ax.set_xlabel("")
plt.show()

### 6. Top barriers to performing daily exercise?

In [None]:
# Each response stores the selected barriers joined by ";". Tally every option
# across all respondents. Counter replaces the hand-written "if key in dict"
# bookkeeping and keeps keys in first-seen order (a later cell drops a row by
# that position); the result is converted back to a plain dict so later cells
# see the same type as before.
barrier_list = dict(
    Counter(
        barrier
        for selected_options in df['Barriers']
        for barrier in selected_options.split(";")
    )
)

In [None]:
barrier_list

In [None]:
# Rank the barriers by count, keep the six most selected rows, then remove the
# row whose index label is 3 (the fourth key of barrier_list), leaving five.
# NOTE(review): label 3 is assumed to be the "no barriers" answer — confirm;
# drop raises KeyError if that label is not among the six rows kept.
top_5_barrier = (
    pd.DataFrame(list(barrier_list.items()))
    .sort_values(by=1, ascending=False)
    .head(6)
    .drop(index=3)
)

#### We'll drop the row with index label 3 (the fourth option in `barrier_list`), as it just says that there aren't any barriers.

In [None]:
# Share (in %) of each remaining barrier among the rows kept in top_5_barrier.
barrier_counts = top_5_barrier[1]
top_5_barrier["Percentage"] = barrier_counts.div(barrier_counts.sum()).mul(100)

In [None]:
top_5_barrier

In [None]:
# Pie chart of the barrier shares, turned into a donut by a disc over the centre.
fig, ax = plt.subplots()
ax.pie(top_5_barrier["Percentage"], labels=top_5_barrier[0], autopct="%.1f%%", shadow=True)
# Disc of radius 0.8 centred on the pie.
centre_circle = plt.Circle((0,0),0.8,color='yellow', fc='white',linewidth=1.5)
ax.add_artist(centre_circle)
ax.set_title("Top 5 barriers to daily exercise")
plt.show()