# Data Manipulation: Survey Monkey Dataset

# Step : 1

In [1]:
import pandas as pd
import os

# Build the workbook path from the current working directory
pwd = os.getcwd()
workbook_path = pwd + "/Data - Survey Monkey Output Edited.xlsx"

# Load the "Edited_Data" sheet; `df` is kept as the unmodified import
df = pd.read_excel(workbook_path, sheet_name="Edited_Data")

# Independent copy, so in-place edits to `df_modified` do not touch `df`
df_modified = df.copy()


# Step : 2

In [2]:
# Columns at positions 1-6 of the imported frame are removed.
# The selection is taken from `df` (the untouched import) instead of from
# `df_modified`, so re-running this cell always produces the same frame rather
# than dropping six more columns on every execution.
drop_columns = list(df.columns[1:7])

# `drop` returns a new frame; nothing is mutated in place
df_modified = df.drop(columns=drop_columns)

# Displaying the columns that were dropped (must be the last expression to render)
drop_columns


# Step : 3

The `pd.melt` function in pandas is used to reshape or unpivot a DataFrame from wide format to long format. The syntax for `pd.melt` is as follows:

```python
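pd.melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True)
```

- `frame`: The DataFrame to be melted.
- `id_vars`: Columns to be retained in the melted DataFrame (the identifier variables).
- `value_vars`: Columns to be melted (the measured variables). If omitted, every column not listed in `id_vars` is melted.
- `var_name`: Name of the variable column. If `None`, `frame.columns.name` is used, falling back to 'variable'.
- `value_name`: Name to be used for the column that contains values. Default is 'value'.
- `col_level`: If columns are MultiIndex, use this level to melt.
- `ignore_index`: If `True` (the default), the original index is replaced by a new 0..n-1 index; if `False`, the original index is kept and its labels repeat as needed (pandas 1.1+).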

Here's an example:

```python
import pandas as pd

# Sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Math': [90, 85, 78],
        'English': [88, 76, 92],
        'Science': [95, 89, 84]}
df = pd.DataFrame(data)

# Using pd.melt to reshape the DataFrame
melted_df = pd.melt(df, id_vars='Name', value_vars=['Math', 'English', 'Science'],
                    var_name='Subject', value_name='Score')

print(melted_df)
```

This will melt the DataFrame, and the resulting DataFrame (`melted_df`) will have columns 'Name', 'Subject', and 'Score'. The 'Subject' column will contain the original column names ('Math', 'English', 'Science'), and the 'Score' column will contain the corresponding values.

In [3]:
# Split point between identifier columns and answer columns
id_column_count = 8

# Leading columns stay as identifiers; every remaining column is unpivoted
id_vars = df_modified.columns[:id_column_count]
value_vars = df_modified.columns[id_column_count:]

# Wide -> long: one row per (identifier columns, question column) pair.
# - the former column header goes into 'Question + Subquestion'
# - the cell value goes into 'Answer'
# - col_level is left at its default (None)
df_melted = df_modified.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='Question + Subquestion',
    value_name='Answer',
)

# Preview the first two rows of the long-format frame
df_melted.head(2)


Unnamed: 0,Respondent ID,Identify which division you work in. - Response,Identify which division you work in. - Other (please specify),Which of the following best describes your position level? - Response,Which generation are you apart of? - Response,Please select the gender in which you identify. - Response,Which duration range best aligns with your tenure at your company? - Response,Which of the following best describes your employment type? - Response,Question + Subquestion,Answer
0,5379192392,Infrastructure,,Staff,Generation X (born between 1965-1980),Male,0-2 years,Full time Employee,Question 1 - Response,
1,2658722536,Finance,,Staff,,,10+ years,Full time Employee,Question 1 - Response,Answer 4


# Step : 4

In [4]:
# Reading the 'Question' sheet from the Excel file into a DataFrame
question_imported = pd.read_excel(pwd + "/Data - Survey Monkey Output Edited.xlsx", sheet_name="Question")

# Build the lookup table as a separate frame.
# `drop` without `inplace` returns a new DataFrame, so `question_imported`
# keeps every imported column (a plain `question = question_imported` would only
# alias the same object and an in-place drop would alter both names).
# Columns removed:
# - "Raw Question"
# - "Raw Subquestion"
# - "Subquestion"
question = question_imported.drop(columns=["Raw Question", "Raw Subquestion", "Subquestion"])

# Displaying the first two rows of the processed 'question' DataFrame
question.head(2)


Unnamed: 0,Question,Question + Subquestion
0,Respondent ID,Respondent ID
1,Start Date,Start Date


# Step : 5

In [5]:
# Attach the 'Question' label to every melted row via the shared
# 'Question + Subquestion' key (left join keeps every melted row)
df_merged = pd.merge(left=df_melted, right=question, how='left', on='Question + Subquestion')

# A left join must not change the row count; a difference means the lookup
# table holds duplicate 'Question + Subquestion' keys that multiplied rows.
len_df_merged = len(df_merged)
len_df_melted = len(df_melted)
assert len_df_merged == len_df_melted, (
    f"Merge changed the row count: {len_df_melted} -> {len_df_merged}; "
    "check 'question' for duplicate 'Question + Subquestion' keys"
)

# Number of distinct respondents who gave at least one non-null answer,
# per question. Built in a single chain so `respondents` only ever refers to
# the aggregated table.
respondents = (
    df_merged[df_merged["Answer"].notna()]
    .groupby("Question")["Respondent ID"]
    .nunique()
    .reset_index(name="Respondents")
)

# Displaying the 'respondents' DataFrame with counts of unique respondents for each question
respondents


Unnamed: 0,Question,Respondents
0,Question 1,119
1,Question 10,198
2,Question 11,164
3,Question 12,114
4,Question 13,108
5,Question 14,105
6,Question 15,114
7,Question 16,117
8,Question 17,135
9,Question 18,109


# Step : 6

In [6]:
# Attach the per-question respondent count to every row of 'df_merged'
df_merged_two = pd.merge(left=df_merged, right=respondents, how='left', on='Question')

# A left join must not change the row count; a difference means 'respondents'
# holds duplicate 'Question' keys that multiplied rows.
len_df_merged_two = len(df_merged_two)
len_df_melted = len(df_melted)
assert len_df_merged_two == len_df_melted, (
    f"Merge changed the row count: {len_df_melted} -> {len_df_merged_two}; "
    "check 'respondents' for duplicate 'Question' keys"
)

# Number of distinct respondents who gave each answer to each
# question/subquestion (rows with a null 'Answer' are excluded first)
same_answer = (
    df_merged_two[df_merged_two["Answer"].notna()]
    .groupby(["Question + Subquestion", "Answer"])["Respondent ID"]
    .nunique()
    .reset_index(name="Same Answer")
)

# Displaying the 'same_answer' DataFrame with counts of unique respondents for each question and answer combination
same_answer


Unnamed: 0,Question + Subquestion,Answer,Same Answer
0,Question 1 - Response,Answer 1,14
1,Question 1 - Response,Answer 2,10
2,Question 1 - Response,Answer 3,13
3,Question 1 - Response,Answer 4,17
4,Question 1 - Response,Answer 5,22
...,...,...,...
683,Question 9 - Response 4,Answer 4,16
684,Question 9 - Response 4,Answer 5,13
685,Question 9 - Response 4,Answer 6,14
686,Question 9 - Response 4,Answer 7,12


# Step : 7

In [7]:
# Attach the 'Same Answer' count from 'same_answer' to every row of
# 'df_merged_two', matching on both the question/subquestion and the answer
df_merged_three = pd.merge(left=df_merged_two, right=same_answer, how='left', on=["Question + Subquestion", "Answer"])

# Rows with no match in 'same_answer' (e.g. a null 'Answer') get a count of 0.
# Assigning the result back avoids the chained-assignment warning that an
# in-place fillna on the column would raise.
df_merged_three["Same Answer"] = df_merged_three["Same Answer"].fillna(0)

# A left join must not change the row count. The new length is stored under its
# own name so 'len_df_merged_two' is not overwritten with another frame's length.
len_df_merged_three = len(df_merged_three)
len_df_merged_two_original = len(df_merged_two)
assert len_df_merged_three == len_df_merged_two_original, (
    f"Merge changed the row count: {len_df_merged_two_original} -> {len_df_merged_three}; "
    "check 'same_answer' for duplicate ('Question + Subquestion', 'Answer') keys"
)

# Displaying the third merged DataFrame, which adds the 'Same Answer' count per question/answer pair
df_merged_three


Unnamed: 0,Respondent ID,Identify which division you work in. - Response,Identify which division you work in. - Other (please specify),Which of the following best describes your position level? - Response,Which generation are you apart of? - Response,Please select the gender in which you identify. - Response,Which duration range best aligns with your tenure at your company? - Response,Which of the following best describes your employment type? - Response,Question + Subquestion,Answer,Question,Respondents,Same Answer
0,5379192392,Infrastructure,,Staff,Generation X (born between 1965-1980),Male,0-2 years,Full time Employee,Question 1 - Response,,Question 1,119,0.0
1,2658722536,Finance,,Staff,,,10+ years,Full time Employee,Question 1 - Response,Answer 4,Question 1,119,17.0
2,4044163394,Infrastructure,,Department Lead,Generation X (born between 1965-1980),Male,3-5 years,Full time Employee,Question 1 - Response,Answer 5,Question 1,119,22.0
3,5535865599,Infrastructure,,Manager,Millennial (born between 1981-2000),Non-Binary,5-10 years,Full time Employee,Question 1 - Response,Answer 1,Question 1,119,14.0
4,3356802928,Port Operations,,Manager,Generation X (born between 1965-1980),Female,10+ years,Full time Employee,Question 1 - Response,,Question 1,119,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17023,7940065082,Infrastructure,,Department Lead,Baby Boomer (born between 1946-1964),Male,10+ years,Full time Employee,Question 30 - Response 3,Answer 8,Question 30,182,14.0
17024,5157705612,Finance,,Staff,Millennial (born between 1981-2000),Female,5-10 years,Full time Employee,Question 30 - Response 3,Answer 6,Question 30,182,20.0
17025,9920755555,Port Operations,,Staff,Millennial (born between 1981-2000),Female,3-5 years,Full time Employee,Question 30 - Response 3,,Question 30,182,0.0
17026,6638341389,Infrastructure,,Manager,Millennial (born between 1981-2000),Female,3-5 years,Full time Employee,Question 30 - Response 3,,Question 30,182,0.0


# Step : 8

In [8]:
# Short names for the seven columns that follow 'Respondent ID'
new_col = ["Division Primary", "Division Secondary", "Position", "Generation", "Gender", "Tenure", "Employment Type"]

# Current headers of those columns (positions 1-7)
old_col = list(df_merged_three.columns[1:8])

# Map each current header to its short replacement, pairing by position
rename_dict = {current: short for current, short in zip(old_col, new_col)}

# `rename` returns a new frame, so 'df_merged_three' keeps its original headers
output = df_merged_three.rename(columns=rename_dict)

# Show the frame with the shortened headers
output


Unnamed: 0,Respondent ID,Division Primary,Division Secondary,Position,Generation,Gender,Tenure,Employment Type,Question + Subquestion,Answer,Question,Respondents,Same Answer
0,5379192392,Infrastructure,,Staff,Generation X (born between 1965-1980),Male,0-2 years,Full time Employee,Question 1 - Response,,Question 1,119,0.0
1,2658722536,Finance,,Staff,,,10+ years,Full time Employee,Question 1 - Response,Answer 4,Question 1,119,17.0
2,4044163394,Infrastructure,,Department Lead,Generation X (born between 1965-1980),Male,3-5 years,Full time Employee,Question 1 - Response,Answer 5,Question 1,119,22.0
3,5535865599,Infrastructure,,Manager,Millennial (born between 1981-2000),Non-Binary,5-10 years,Full time Employee,Question 1 - Response,Answer 1,Question 1,119,14.0
4,3356802928,Port Operations,,Manager,Generation X (born between 1965-1980),Female,10+ years,Full time Employee,Question 1 - Response,,Question 1,119,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17023,7940065082,Infrastructure,,Department Lead,Baby Boomer (born between 1946-1964),Male,10+ years,Full time Employee,Question 30 - Response 3,Answer 8,Question 30,182,14.0
17024,5157705612,Finance,,Staff,Millennial (born between 1981-2000),Female,5-10 years,Full time Employee,Question 30 - Response 3,Answer 6,Question 30,182,20.0
17025,9920755555,Port Operations,,Staff,Millennial (born between 1981-2000),Female,3-5 years,Full time Employee,Question 30 - Response 3,,Question 30,182,0.0
17026,6638341389,Infrastructure,,Manager,Millennial (born between 1981-2000),Female,3-5 years,Full time Employee,Question 30 - Response 3,,Question 30,182,0.0


# Step : 9

In [9]:
import os

# Build the export path from the current working directory
pwd = os.getcwd()
output_path = pwd + "/Final_Output.xlsx"

# Write the final table to Excel, omitting the DataFrame index
output.to_excel(output_path, index=False)


# Done: the final table is saved as `Final_Output.xlsx`