### I ran a survey asking multiple people a series of questions and recorded whether they prefer WFH (work from home) or WFO (work from office). Using the survey answers as features, I check how well a model can predict that preference. The `Target` column holds the true answer, which we use to train our model. We then test the model and compare the actual and predicted values.

In [None]:
import numpy as np   # Linear algebra library
import pandas as pd  # DataFrame loading and manipulation
import matplotlib.pyplot as plt  # to plot graphs
import seaborn as sns  # to plot graphs
sns.set()  # apply seaborn's default theme to the plots that follow

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and model-convergence warnings.
warnings.filterwarnings('ignore')

## Step 2. Fetch your dataset

In [None]:
# Location of the survey CSV, relative to the notebook's working directory.
DATA_PATH = '../input/predict-if-people-prefer-wfh-verses-wfo-data/WFH_WFO_dataset.csv'

# Read the survey responses and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Show the dtype pandas inferred for each column of the raw data.
data.dtypes

In [None]:
# List the column names available in the dataset.
data.columns

## Step 3: Understand your data set

#### Here we can see that 123 people want to go back to the office and 84 people would like work from home to continue

In [None]:
#WFH/remote work for sure -1
#I'd rather go back to office -0
ax=sns.countplot(x='Target',data=data)
plt.title('WFH vs WFO')
for p in ax.patches:
        ax.annotate('{:}'.format(p.get_height()), (p.get_x()+0.25, p.get_height()+3)) 

#### We have a count of 107 Females and 100 Males 

In [None]:
# Number of respondents per gender.
ax = sns.countplot(x='Gender', data=data)
plt.title('Male vs Female')

# Write each bar's count slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.25, bar_height + 3)
    ax.annotate('{:}'.format(bar_height), label_position)

In [None]:
# Quick look at the first three rows to pick the next column to explore.
data.head(3)

#### The majority of the people in our data set do not have kids

In [None]:
# Number of respondents with and without kids.
ax = sns.countplot(x='kids', data=data)
plt.title('kids vs no kids')

# Write each bar's count slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.25, bar_height + 3)
    ax.annotate('{:}'.format(bar_height), label_position)

In [None]:
# Categorical scatter of Target against Age, coloured by the kids answer.
sns.catplot(
    data=data,
    x='Age',
    y='Target',
    hue='kids',
)

#### We can see that remote working has caused stress levels to increase for most people 

In [None]:
# Number of respondents per calmer_stressed answer.
ax = sns.countplot(x='calmer_stressed', data=data)
plt.title('calmer vs stressed')

# Write each bar's count slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.25, bar_height + 3)
    ax.annotate('{:}'.format(bar_height), label_position)

In [None]:
# Show a single row as a reminder of the column names.
data.head(1)

#### Most people have said that WFH has made them more productive (Rank:4)

In [None]:
# Number of respondents per RM_productive rank.
# size() gives one count per rank; count() would instead draw one bar per
# remaining column for every rank, which clutters the chart and the legend.
ax = data.groupby("RM_productive").size().plot.bar(figsize=(18, 5))
ax.set(
    title="Responses per RM_productive rank",
    xlabel="RM_productive",
    ylabel="count",
);

#### The graph below gives the count of people in each occupation

In [None]:
# Number of respondents per occupation.
ax = sns.countplot(x='Occupation', data=data)
plt.title("Occupation count")

# Write each bar's count slightly above the bar (smaller offsets than the
# other count plots).
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.1, bar_height + 1)
    ax.annotate('{:}'.format(bar_height), label_position)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset.
data.describe()

## Step 4: Check for null values

In [None]:
# Method 1: info() lists the non-null count and dtype of every column.
data.info() # author's observation: no nulls reported

In [None]:
# Method 2: count the missing values in each column directly.
data.isnull().sum()

## Step 5: Check for categorical data

In [None]:
# Number of distinct values per column; low counts point to categorical columns.
data.nunique()

In [None]:
# Inspect the raw values to see which columns hold text categories.
data.head(5)

In [None]:
# Column dtypes before any encoding is applied.
data.dtypes

#### Get dummy values for Gender and calmer_stressed column

In [None]:
# 1. Gender: build indicator columns; drop_first=True removes the first
# category so only the remaining one(s) are kept.
gender=pd.get_dummies(data['Gender'],drop_first=True)
gender.head()
# Author's note: FEMALE=0, MALE=1. The surviving column is selected later
# under the name 'Male'.

In [None]:
# 2. calmer_stressed: same indicator encoding with the first category dropped.
# The surviving column is selected later under the name 'STRESSED'.
calm_stress=pd.get_dummies(data['calmer_stressed'],drop_first=True)
calm_stress.head()

In [None]:
# Append the indicator columns and remove the text columns they replace.
# NOTE(review): this rebinds `data`, so the cell is not safe to run twice.
encoded_parts = [data, gender, calm_stress]
data = (
    pd.concat(encoded_parts, axis=1)
    .drop(columns=['Gender', 'calmer_stressed'])
)

In [None]:
# Confirm the indicator columns were appended and the text columns removed.
data.head(5)

In [None]:

# Column dtypes after the indicator encoding.
data.dtypes

### For the Yes/No columns, map Yes to 1 and No to 0

In [None]:
# Columns whose answers are Yes/No and therefore need a 1/0 encoding.
# 'Same_ofiice_home_location' is spelled this way because the name is used as a
# column key of the dataset; do not "correct" it here.
categorical =  ['Same_ofiice_home_location', 'kids', 'RM_save_money', 'RM_quality_time', 'RM_better_sleep', 'digital_connect_sufficient','RM_job_opportunities']
# Helper that converts a column of Yes/No answers into 1/0.
def binary_map(x):
    """Map a Series of Yes/No answers to 1/0.

    Matching ignores letter case and surrounding whitespace, so variants such
    as ``'yes'`` or ``' No '`` are encoded instead of silently becoming NaN.

    Parameters
    ----------
    x : pandas.Series
        Column of survey answers.

    Returns
    -------
    pandas.Series
        1 for yes, 0 for no, NaN for any other value (including missing ones).
    """
    # Normalise only strings; non-string entries (e.g. NaN) pass through as-is.
    normalised = x.map(lambda v: v.strip().lower() if isinstance(v, str) else v)
    return normalised.map({'yes': 1, 'no': 0})

# Replace the Yes/No answers with 1/0 in every column listed in `categorical`.
# Values that binary_map does not recognise become NaN.
# NOTE(review): re-running this cell on already-encoded data maps the 1/0
# values to NaN, because they are no longer Yes/No answers.
data[categorical] = data[categorical].apply(binary_map)



In [None]:
# Check the Yes/No columns now hold 1/0 (NaN marks unrecognised answers).
data.head()

In [None]:
# Drop rows where either column is NaN after the Yes/No mapping, then cast both
# columns back to int (the NaNs had forced them to float).
# Done as one chain instead of assigning columns on the frame returned by
# dropna(): that assignment is the pattern pandas flags with
# SettingWithCopyWarning, which the notebook's global warnings filter would hide.
data = (
    data
    .dropna(subset=['Same_ofiice_home_location', 'RM_job_opportunities'])
    .astype({'Same_ofiice_home_location': int, 'RM_job_opportunities': int})
)

data.dtypes

## Step 6: Split data into train and test data

In [None]:
# Feature columns fed to the model, one name per line for easier review.
feature_columns = [
    'Age',
    'Same_ofiice_home_location',
    'kids',
    'RM_save_money',
    'RM_quality_time',
    'RM_better_sleep',
    'RM_professional_growth',
    'RM_lazy',
    'RM_productive',
    'digital_connect_sufficient',
    'RM_better_work_life_balance',
    'RM_improved_skillset',
    'RM_job_opportunities',
    'Male',
    'STRESSED',
]

# x holds the model inputs, y the label to predict.
x = data[feature_columns]
y = data['Target']



In [None]:
# Preview the feature matrix that will be split into train and test sets.
x.head()

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator; the split itself is made reproducible by the
# explicit random_state below.
np.random.seed(0)

# Hold out 20% of the rows for testing and train on the remaining 80%.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=5,
)


In [None]:
# Look at the full encoded frame again (the split above does not modify it).
data.head(3)

## Step 7: Train the model

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Logistic regression with scikit-learn's default hyper-parameters.
logm=LogisticRegression()
# Fit on the training split only.
# NOTE(review): a convergence warning, if raised here, would be hidden by the
# global warnings filter set at the top of the notebook.
logm.fit(x_train,y_train)

## Step 8: Predict WFH or WFO using test data

In [None]:
# Predicted class label for every row of the held-out test features.
predictions=logm.predict(x_test)

## Step 9: Check the accuracy

#### This shows an accuracy of about 88% (21 of 24): 21 values were predicted correctly and 3 were incorrect

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, predictions)
print(report)
print("\n")

# Confusion matrix: rows are the actual classes, columns the predicted ones.
matrix = confusion_matrix(y_test, predictions)
print(matrix)

In [None]:
import math
from sklearn import metrics

# These are error metrics designed for continuous targets, applied here to
# class labels. NOTE(review): if Target is coded 0/1, MAE and MSE both equal
# the fraction of misclassified rows, so they add little beyond accuracy.
abs_error = metrics.mean_absolute_error(y_test, predictions)
squared_error = metrics.mean_squared_error(y_test, predictions)
root_squared_error = math.sqrt(squared_error)

print('Mean Abs value:', abs_error)
print('Mean squared value:', squared_error)
print('root mean squared error value:', root_squared_error)

In [None]:

# NOTE(review): this is not the cell's last expression, so the shape is never displayed.
y_test.shape
# Test labels reshaped into a single-column 2-D array (n rows, 1 column).
# Not referenced again in the visible part of the notebook.
y_test_matrix = y_test.values.reshape(-1,1)


In [None]:
# Put the true labels and the model's predictions side by side.
comparison = {
    'actual': y_test,
    'Predicted': predictions,
}
dframe = pd.DataFrame(comparison)

# Show the first 20 test rows for a manual comparison.
dframe.head(20)