In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [3]:
# Step 2: Load the Data
# Load the dataset
url = 'school_rating.csv'
df = pd.read_csv(url)


In [4]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


  Start Date       End Date  Data Year     District Name  Campus Number  \
0   Jan 2016  December 2016       2016        AUSTIN ISD      227901046   
1   Jan 2016  December 2016       2016        AUSTIN ISD      227901026   
2   Jan 2017  December 2017       2017  THE EXCEL CENTER      227828001   
3   Jan 2022  December 2022       2022         MANOR ISD      227907107   
4   Jan 2019  December 2019       2019        AUSTIN ISD      227901065   

               Campus Name  2 Digit ESC Region         Region Name  \
0               BURNET M S                  13  REGION 13:\nAUSTIN   
1  GRADUATION PREP ACADEMY                  13  REGION 13:\nAUSTIN   
2         THE EXCEL CENTER                  13  REGION 13:\nAUSTIN   
3           OAK MEADOWS EL                  13   REGION 13: AUSTIN   
4        SADLER MEANS YWLA                  13   REGION 13: AUSTIN   

  County Name    School Type  ... Total Number\nof Students at School  \
0      TRAVIS  Middle School  ...                      

In [5]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# For simplicity, we might drop rows with missing values
df.dropna(inplace=True)  # Alternatively, you can use df.fillna() for imputation


Start Date                              0
End Date                                0
Data Year                               0
District Name                           0
Campus Number                           0
Campus Name                             0
2 Digit ESC Region                      0
Region Name                             0
County Name                             0
School Type                             0
Grades Served                           0
Alt Ed                                  9
Total Number of Students in County     54
Total Number\nof Students at School     0
% Eco Dis                               0
Overall Grade/Rating                    0
Accountability System Conversion        0
Overall Score                          15
Charter School                          6
Street Address                          2
City                                    0
Zip Code                                2
Geocode                                 0
dtype: int64


In [6]:
# Step 5: Data Visualization
# Distribution of school ratings
sns.histplot(df['Rating'], kde=True, bins=30)
plt.title('Distribution of School Ratings')
plt.show()

# Ratings by state
plt.figure(figsize=(12, 8))
sns.boxplot(x='State', y='Rating', data=df)
plt.title('School Ratings by State')
plt.xticks(rotation=90)
plt.show()

# Relationship between Rating and Student-to-Teacher Ratio
sns.scatterplot(x='Student-Teacher Ratio', y='Rating', data=df)
plt.title('School Rating vs Student-Teacher Ratio')
plt.show()

# Ratings by School Type
plt.figure(figsize=(12, 8))
sns.boxplot(x='School Type', y='Rating', data=df)
plt.title('School Ratings by Type')
plt.xticks(rotation=90)
plt.show()


KeyError: 'Rating'

In [7]:
# Step 6: Univariate Analysis
# Distribution of Ratings
sns.histplot(df['Rating'], kde=True, bins=30)
plt.title('Distribution of Ratings')
plt.show()

# Distribution of Student-Teacher Ratio
sns.histplot(df['Student-Teacher Ratio'], kde=True, bins=30)
plt.title('Distribution of Student-Teacher Ratio')
plt.show()


KeyError: 'Rating'

In [8]:
# Step 7: Bivariate Analysis
# Rating vs Student-Teacher Ratio
sns.scatterplot(x='Student-Teacher Ratio', y='Rating', data=df)
plt.title('Rating vs Student-Teacher Ratio')
plt.show()

# Rating by School Type
plt.figure(figsize=(12, 8))
sns.boxplot(x='School Type', y='Rating', data=df)
plt.title('Rating by School Type')
plt.xticks(rotation=90)
plt.show()


ValueError: Could not interpret value `Student-Teacher Ratio` for parameter `x`

In [None]:
# Step 8: Multivariate Analysis
# Pair plot for selected features
sns.pairplot(df[['Rating', 'Student-Teacher Ratio', 'School Type']])
plt.show()


In [None]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in Ratings
sns.boxplot(x=df['Rating'])
plt.title('Boxplot of Ratings')
plt.show()

# Removing outliers from Ratings
Q1 = df['Rating'].quantile(0.25)
Q3 = df['Rating'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Rating'] < (Q1 - 1.5 * IQR)) | (df['Rating'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in Student-Teacher Ratio
sns.boxplot(x=df['Student-Teacher Ratio'])
plt.title('Boxplot of Student-Teacher Ratio')
plt.show()


In [9]:
# Step 10: Feature Engineering
# Create additional features or normalize data if needed
# For example, normalize Student-Teacher Ratio
df['Student-Teacher Ratio Normalized'] = (df['Student-Teacher Ratio'] - df['Student-Teacher Ratio'].mean()) / df['Student-Teacher Ratio'].std()


KeyError: 'Student-Teacher Ratio'

In [10]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Ratings distribution
rating_dist = df['Rating'].describe()
print(f"Ratings Distribution:\n{rating_dist}")

# Average Rating by State
avg_rating_by_state = df.groupby('State')['Rating'].mean()
print(f"Average Rating by State:\n{avg_rating_by_state}")

# Ratings by School Type
avg_rating_by_school_type = df.groupby('School Type')['Rating'].mean()
print(f"Average Rating by School Type:\n{avg_rating_by_school_type}")

# Insights from scatter plots
print("School Ratings are influenced by factors such as Student-Teacher Ratio and School Type.")


Key Insights:


KeyError: 'Rating'

Findings:
1. School Ratings: The distribution and average ratings provide insights into overall school performance.
2. State Differences: There may be variations in ratings across different states.
3. Impact of Student-Teacher Ratio: Schools with different student-teacher ratios might have different ratings.
4. School Type Trends: Different types of schools might show distinct rating patterns.