In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the survey data from a CSV file.
# A raw string keeps the Windows backslashes literal; in a normal string
# "\P", "\C" and "\s" are invalid escape sequences that Python warns about.
df = pd.read_csv(r"D:\Projects\CSV\survey_results_public.csv")

In [None]:
df  # Bare expression: the notebook renders the loaded DataFrame as this cell's output

In [None]:
# DataFrame.dropna() / drop_duplicates() return NEW frames; calling them without
# assigning the result leaves `df` unchanged, so the results are captured here.

# Count duplicate rows before removing them.
duplicate_row_count = int(df.duplicated().sum())
print(f"Duplicate rows: {duplicate_row_count}")

# Remove exact duplicate rows (safe to re-run: a second pass removes nothing).
df = df.drop_duplicates()

# Rows with missing values are reported rather than dropped: a blanket dropna()
# discards every row that has a NaN in ANY column, and later cells fill or
# coerce missing values column by column instead.
missing_per_column = df.isna().sum()
missing_per_column[missing_per_column > 0].sort_values(ascending=False)


In [None]:
# Mean of every numeric column.
numeric_columns = df.select_dtypes(include=np.number)  # type: ignore
mean_value = numeric_columns.mean()
print(mean_value)

# Only the LAST bare expression of a cell is rendered, so the intermediate
# summaries are printed explicitly; otherwise their output is silently lost.
print(df.head())      # First few rows of the DataFrame
print(df.shape)       # Dimensions of the DataFrame (rows, columns)
df.info()             # Summary information (prints by itself)
print(df.describe())  # Descriptive statistics of the numeric columns
df.columns            # Column names (rendered as the cell output)


In [None]:
# Create a new column based on existing columns.
# Both inputs are coerced to numbers first: a later cell has to run
# pd.to_numeric on 'YearsCodePro', which suggests these columns are loaded as
# text (TODO confirm dtypes), and `+` on text columns concatenates strings
# ("10" + "5" -> "105") instead of adding years.
# Answers that are not plain numbers become NaN, and so does their total.
years_code = pd.to_numeric(df['YearsCode'], errors='coerce')
years_code_pro = pd.to_numeric(df['YearsCodePro'], errors='coerce')
df['TotalYearsCode'] = years_code + years_code_pro

In [None]:
# Build a lazy GroupBy object keyed on each respondent's country;
# nothing is aggregated until a method such as .size() or .mean() is called.
grouped_data = df.groupby(by='Country')

In [None]:
# Printing a GroupBy object only shows its repr (class name and memory address),
# so show the number of rows in each country group instead.
print(grouped_data.size())

In [None]:
# Order the respondents from highest to lowest yearly compensation.
# sort_values returns a new frame; `df` itself keeps its original order.
sorted_data = df.sort_values(by='ConvertedCompYearly', ascending=False)


In [None]:
# Replace missing 'CompTotal' entries with 0 and store the result back in the column.
comp_total_filled = df['CompTotal'].fillna(value=0)
df['CompTotal'] = comp_total_filled

In [None]:
# Coerce 'Age' to a numeric dtype; entries that cannot be parsed become NaN.
# NOTE(review): if 'Age' holds text buckets rather than numbers, this turns the
# whole column into NaN — confirm against the raw data.
age_as_number = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = age_as_number

# Summary statistics (stored for later use; this cell displays nothing).
min_value = df['Age'].min()
std_dev = df['ConvertedCompYearly'].std()

In [None]:
# Make both inputs numeric; values that cannot be parsed become NaN.
for column in ('CompTotal', 'YearsCodePro'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Element-wise product of the two columns (NaN wherever either input is NaN).
df['TotalComp'] = df['CompTotal'].mul(df['YearsCodePro'])

In [None]:
# Show the two inputs next to the derived 'TotalComp' column.
comp_columns = ['CompTotal', 'YearsCodePro', 'TotalComp']
print(df.loc[:, comp_columns])

In [None]:
# Replace missing 'TotalComp' values with 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that is chained assignment, which raises a FutureWarning in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
df['TotalComp'] = df['TotalComp'].fillna(0)

In [None]:
# Re-display the compensation columns now that 'TotalComp' has no missing values.
columns_to_show = ['CompTotal', 'YearsCodePro', 'TotalComp']
print(df.loc[:, columns_to_show])

In [None]:
# Give the yearly compensation column a shorter name for the plots below.
# Cells above that still reference 'ConvertedCompYearly' will raise a KeyError
# if they are re-run after this one.
df = df.rename({'ConvertedCompYearly': 'Salary'}, axis='columns')

In [None]:
df.info()  # Re-check the column summary after the conversion and renaming cells above

In [None]:
# Are you more likely to get a job as a developer if you have a master's degree?
# Match on the "Master" prefix instead of one exact label: the punctuation of the
# label varies ("MA, MS" vs "M.A., M.S."), and an exact comparison that misses
# silently produces an empty result. Missing EdLevel values count as "no match".
has_masters = df['EdLevel'].str.startswith('Master', na=False)
edlevel_job_counts = df.loc[has_masters, 'Employment'].value_counts()

if not edlevel_job_counts.empty:
    plt.figure(figsize=(12, 6))
    # x and y are passed as vectors, so no `data=` argument is needed.
    sns.barplot(x=edlevel_job_counts.index, y=edlevel_job_counts.values)
    plt.title('Employment Status for Developers with a Master\'s Degree')
    plt.xlabel('Employment Status')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No data available for developers with a master's degree.")



In [None]:
# Are you more likely to get a job as a developer if you have a master's degree?
# Select master's degree holders by label prefix; an exact string comparison
# returns nothing when the label's punctuation differs even slightly
# ("MA, MS" vs "M.A., M.S."). Missing EdLevel values are treated as "no match".
masters_mask = df['EdLevel'].str.startswith('Master', na=False)
edlevel_job_counts = df.loc[masters_mask, 'Employment'].value_counts()

print(edlevel_job_counts)  # Print the counts (a Series) for debugging purposes

if not edlevel_job_counts.empty:
    plt.figure(figsize=(12, 6))
    # The index holds the employment labels, the values hold their counts.
    sns.barplot(x=edlevel_job_counts.index, y=edlevel_job_counts.values)
    plt.title('Employment Status for Developers with a Master\'s Degree')
    plt.xlabel('Employment Status')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No data available for developers with a master's degree.")


In [None]:
# Print unique values in the 'EdLevel' column to see the exact label spellings
print(df['EdLevel'].unique())

# Are you more likely to get a job as a developer if you have a master's degree?
# Filter by the "Master" prefix so the selection does not depend on the exact
# punctuation of the label ("MA, MS" vs "M.A., M.S."); with an exact match a
# single differing character yields an empty result. NaN never matches.
is_masters = df['EdLevel'].str.startswith('Master', na=False)
edlevel_job_counts = df.loc[is_masters, 'Employment'].value_counts()

print(edlevel_job_counts)  # Print the counts (a Series) for debugging purposes

if not edlevel_job_counts.empty:
    plt.figure(figsize=(12, 6))
    # Vectors are passed directly, so `data=` is not required.
    sns.barplot(x=edlevel_job_counts.index, y=edlevel_job_counts.values)
    plt.title('Employment Status for Developers with a Master\'s Degree')
    plt.xlabel('Employment Status')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No data available for developers with a master's degree.")


The next cell plots the 12 countries with the highest average salary. It uses the "seaborn" library to set a white-grid plot style and the 'viridis' color palette for the bars. Value labels are added with "plt.text()" so that each bar shows its average salary, and a dashed red line marks the average salary over all respondents. Adjust the font sizes, rotation, and other style parameters as desired to further customize the appearance of the graph.

In [None]:
# Mean salary per country, ordered from highest to lowest
avg_salary_by_country = (
    df.groupby('Country')['Salary']
    .mean()
    .reset_index()
    .sort_values('Salary', ascending=False)
)

# Keep the 12 best-paying countries
top_12_countries = avg_salary_by_country.head(12)

# White-grid Seaborn theme for this and later figures
sns.set(style='whitegrid')

# Bar chart: one bar per country
plt.figure(figsize=(12, 9))
sns.barplot(data=top_12_countries, x='Country', y='Salary', palette='viridis')
plt.title('Top 12 Countries by Average Salary', fontsize=14)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Average Salary', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)

# Write each country's average just above its bar
for bar_position, country_mean in enumerate(top_12_countries['Salary']):
    plt.text(bar_position, country_mean, f"${country_mean:.2f}", ha='center', va='bottom', fontsize=10)

# Dashed reference line at the mean salary over all rows of df
avg_salary_all_countries = df['Salary'].mean()
plt.axhline(y=avg_salary_all_countries, color='red', linestyle='--', linewidth=1)

plt.tight_layout()
plt.show()


The following cell creates a pie chart of the distribution of remote work types. Missing answers ("NA") are counted as their own category, and the chart is customized for a visually appealing result:

In [None]:
# Count the occurrences of each remote work type, including missing answers
remote_work_counts = df['RemoteWork'].value_counts(dropna=False)
slice_count = len(remote_work_counts)

# Show the missing-answer slice as "NA" instead of the raw "nan" index value
slice_labels = ['NA' if pd.isna(label) else label for label in remote_work_counts.index]

# Create a pie chart
plt.figure(figsize=(8, 8))
colors = ['#5DADE2', '#58D68D', '#F4D03F', '#F39C12', '#CACFD2']  # Custom slice colors (matplotlib cycles them if there are more slices)
# Pull out only the first (largest) slice. Built from the actual number of
# slices: plt.pie requires `explode` to be exactly as long as the data.
explode = [0.1 if position == 0 else 0 for position in range(slice_count)]
patches, texts, autotexts = plt.pie(remote_work_counts, labels=slice_labels, autopct='%1.1f%%', startangle=90, # type: ignore
                                    colors=colors, explode=explode, shadow=True, wedgeprops={'linewidth': 1, 'edgecolor': 'white'})

# Replace the automatic percentage texts with "count (percentage)" labels.
# Counts are read positionally via to_numpy(): the Series is indexed by the
# answer labels, so an integer key like remote_work_counts[i] is a deprecated
# positional fallback.
total_answers = remote_work_counts.sum()
for text, count in zip(autotexts, remote_work_counts.to_numpy()):
    percentage = count / total_answers * 100
    text.set_text(f'{count} ({percentage:.1f}%)')

# Customize the plot
plt.title('Remote Work Distribution', fontsize=14, fontweight='bold')
plt.axis('equal')

plt.tight_layout()
plt.show()


To analyze the relationship between professional coding experience (the "YearsCodePro" column) and salary, the next cell creates a bar chart of the median salary for each number of years of experience.
The salary for each experience level is summarized with median() rather than mean(), because the median is less affected by a few extremely high or low salaries.

In [None]:
# Median salary for each number of years of professional coding experience,
# ordered by experience
median_salary_by_experience = (
    df.groupby('YearsCodePro')['Salary']
    .median()
    .reset_index()
    .sort_values('YearsCodePro')
)

# Set up the plot style using Seaborn and a custom color palette
plt.figure(figsize=(18, 6))
sns.set(style='whitegrid')
colors = sns.color_palette('Set2', len(median_salary_by_experience))
sns.barplot(x='YearsCodePro', y='Salary', data=median_salary_by_experience, palette=colors)

# Customize the plot
plt.xlabel('Years of Professional Coding Experience', fontsize=12, labelpad=10)
plt.ylabel('Median Salary', fontsize=12)
plt.title('Median Salary by Coding Experience', fontsize=14)

# One xticks call sets positions, labels and styling together. The labels are
# cast to int to drop the decimal part, and centered under their bars
# (right-aligning unrotated labels shifts them away from the bars).
experience_labels = median_salary_by_experience['YearsCodePro'].astype(int)
plt.xticks(range(len(median_salary_by_experience)), experience_labels,
           rotation=0, ha='center', fontsize=10)
plt.yticks(fontsize=10)

# tight_layout() computes the margins itself, so no manual subplots_adjust()
# is made before it (it would simply be overridden).
plt.tight_layout()
plt.show()


The next cell generates a bar chart showing how often each learning method was reported, with a count label on top of each bar. The bars are ordered from most to least frequent, so the most popular method appears first; it is also printed as output below the chart.

In [None]:
# Frequency of every distinct 'LearnCode' answer, most frequent first.
# NOTE(review): if one answer can list several methods in a single cell, each
# distinct combination is counted as its own category — confirm the format.
learning_methods_counts = df['LearnCode'].value_counts()

# Label of the answer with the highest count
most_popular_method = learning_methods_counts.idxmax()

# Bar chart of the distribution, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
learning_methods_counts.plot(kind='bar', color='blue', ax=ax)

# Titles and axis labels
ax.set_xlabel('Learning Method', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Most Popular Method of Learning to Code', fontsize=14)

plt.xticks(rotation=45, ha='right', fontsize=10)

# Put each bar's count just above it
for bar_position, (_, count) in enumerate(learning_methods_counts.items()):
    ax.text(bar_position, count, str(count), ha='center', va='bottom')

# Leave extra room at the bottom for the rotated tick labels
fig.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.3)

plt.show()

print(f"The most popular method of learning to code is: {most_popular_method}")


In [None]:
# Frequency of every distinct 'LearnCode' answer, most frequent first
learning_methods_counts = df['LearnCode'].value_counts()

# The 12 most frequent answers (positional slice of the sorted counts)
top_12_methods = learning_methods_counts.iloc[:12]

# Most frequent answer within that top 12
most_popular_method = top_12_methods.idxmax()

# Bar chart of the top 12 answers
plt.figure(figsize=(10, 6))
sns.barplot(x=top_12_methods.index, y=top_12_methods.values, palette='viridis')

# Titles, axis labels and a dashed horizontal grid
plt.title('Most Popular Method of Learning to Code (Top 12)', fontsize=14)
plt.xlabel('Learning Method', fontsize=12)
plt.ylabel('Count', fontsize=12)

plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--')

# Write each count just above its bar
for bar_position, (_, count) in enumerate(top_12_methods.items()):
    plt.text(bar_position, count, str(count), ha='center', va='bottom', fontsize=10)

plt.show()


To analyze whether having a master's degree increases the likelihood of getting a job as a developer, you can compare the employment status of developers with and without a master's degree. Here's a step-by-step approach to perform the analysis:

1. Filter the dataset to include only rows where the MainBranch indicates being a developer by profession.
2. Create a new column HasMastersDegree that indicates whether the individual has a master's degree or not.
3. Group the data by the HasMastersDegree column and calculate the count of each employment status category.
4. Visualize the employment status distribution using a bar chart.

In [None]:
# Filter the dataset to include only rows where MainBranch indicates being a developer by profession
df_developers = df[df['MainBranch'] == 'I am a developer by profession'].copy()

# Flag master's degree holders. A prefix match is used because an exact label
# comparison breaks on small punctuation differences ("MA, MS" vs "M.A., M.S.");
# missing EdLevel values are treated as "No".
has_masters = df_developers['EdLevel'].str.startswith('Master', na=False)
df_developers['HasMastersDegree'] = np.where(has_masters, 'Yes', 'No')

# Count developers per (employment status, master's flag). crosstab fills
# combinations that never occur with 0, and reindex guarantees both the
# 'No' and 'Yes' columns exist even if one group is empty.
employment_counts = (
    pd.crosstab(df_developers['Employment'], df_developers['HasMastersDegree'])
    .reindex(columns=['No', 'Yes'], fill_value=0)
)

# Keep the 12 most common employment statuses (by total count, descending)
top_statuses = employment_counts.sum(axis=1).nlargest(12).index

# Long format for plotting: one row per (status, master's flag) pair
top_12_employment_counts = (
    employment_counts.loc[top_statuses]
    .rename_axis(index='Employment Status')
    .reset_index()
    .melt(id_vars='Employment Status', var_name='HasMastersDegree', value_name='Count')
)

# Grouped bar chart: for every employment status, one bar for developers
# without and one for developers with a master's degree
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Employment Status', y='Count', hue='HasMastersDegree',
                 hue_order=['No', 'Yes'], data=top_12_employment_counts, palette='viridis')

# Customize the plot
plt.xlabel('Employment Status', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Top 12 Employment Status of Developers with and without a Master\'s Degree', fontsize=14)

plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--')
plt.legend(title='Has master\'s degree')

# Add count labels to the bars (one container per hue level)
for container in ax.containers:
    ax.bar_label(container, fmt='%d', fontsize=10)

plt.show()


In [None]:
# Stack the two language columns into one long Series.
# Stacking (instead of joining the two strings row by row with str.cat) keeps
# respondents who answered only one of the two questions: str.cat returns NaN
# for a row as soon as either side is missing.
language_combined = pd.concat(
    [df['LanguageHaveWorkedWith'], df['LanguageWantToWorkWith']],
    ignore_index=True,
).dropna()

# Split each answer by delimiter ';' and explode into one row per language
languages = language_combined.str.split(';').explode()

# Count the frequency of each language and keep the 8 most frequent
language_counts = languages.value_counts().head(8)

# Create a bar plot of the top 8 most popular languages
plt.figure(figsize=(10, 6))
sns.barplot(x=language_counts.index, y=language_counts.values, palette='viridis')

plt.title('Top 8 Most Popular Languages')
plt.ylabel('Count')

# Bold x-axis title with extra padding, set once; tight_layout() below makes
# room for it and for the rotated tick labels.
plt.xlabel('Language', fontweight='bold', labelpad=10)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

To determine the top 8 most popular languages, we combine the 'LanguageHaveWorkedWith' and 'LanguageWantToWorkWith' columns and count how often each language appears. The result is shown as a bar plot with customized formatting, and the next cell additionally draws a dashed line at the average count of these eight languages.
To add padding to the x-axis title and enhance the aesthetics of the plot, the label is customized through matplotlib's plt.xlabel function (for example its labelpad and font weight).

In [None]:
# Stack the two language columns into one long Series.
# Joining them row by row with str.cat would yield NaN whenever either column
# is missing, dropping respondents who answered only one of the two questions.
language_combined = pd.concat(
    [df['LanguageHaveWorkedWith'], df['LanguageWantToWorkWith']],
    ignore_index=True,
).dropna()

# Split each answer by delimiter ';' and explode into one row per language
languages = language_combined.str.split(';').explode()

# Count the frequency of each language and keep the 8 most frequent
language_counts = languages.value_counts().head(8)

# Average count over these 8 languages
average_count = language_counts.mean()

# Create a bar plot of the top 8 most popular languages
plt.figure(figsize=(10, 6))
sns.barplot(x=language_counts.index, y=language_counts.values, palette='viridis')

# Add an average line
plt.axhline(average_count, color='red', linestyle='--', label='Average')

plt.title('Top 8 Most Popular Languages')
plt.ylabel('Count')

# Bold x-axis title with extra padding, set in a single call; tight_layout()
# below reserves space for it and for the rotated tick labels.
plt.xlabel('Language', fontweight='bold', labelpad=10)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

plt.legend()

plt.tight_layout()
plt.show()
