# In [None]:
# data_analysis_uk_student_visa.ipynb

# 1. Imports and Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw extract; the path is resolved relative to the working directory.
df = pd.read_csv('education-visas-datasets-mar-2025.csv')

# 2. Data Cleaning
# 'Grants' may carry thousands separators (e.g. "1,234"), so strip the commas
# as literal text before converting. Anything that still is not a number
# becomes NaN through errors='coerce', and those rows are dropped because they
# cannot contribute to a sum.
df['Grants'] = pd.to_numeric(
    df['Grants'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
df = df.dropna(subset=['Grants'])

# 3. Filter for Tier 4 - General Student visas only
# .copy() detaches the filtered rows from the frame they were selected from, so
# the column assignment below writes to an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[df['Visa type subgroup'] == 'Tier 4 - General Student'].copy()

# 4. Time Series: Total visas granted per quarter
# 'Period' is a human-readable "<Year> <Quarter>" label kept on the row-level data.
df['Period'] = df['Year'].astype(str) + ' ' + df['Quarter'].astype(str)
# One row per (Year, Quarter) holding the summed grants for that quarter.
time_series = df.groupby(['Year', 'Quarter'])['Grants'].sum().reset_index()

# Draw the quarterly totals: years along the x-axis, one line per 'Quarter' value.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    x='Year',
    y='Grants',
    hue='Quarter',
    marker='o',
    data=time_series,
    ax=ax,
)
ax.set_title('Total UK Student Visas Granted per Quarter')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Visas')
ax.legend(title='Quarter')
fig.tight_layout()
plt.show()

# 5. By Course Level: Trends over time
# Sum grants for every (Year, Course level) pair, then draw one line per level.
course_trend = (
    df.groupby(['Year', 'Course level'])['Grants']
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    x='Year',
    y='Grants',
    hue='Course level',
    marker='o',
    data=course_trend,
    ax=ax,
)
ax.set_title('Student Visas by Course Level (Yearly)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Visas')
ax.legend(title='Course Level')
fig.tight_layout()
plt.show()

# 6. By Region: Top Regions Over Time
# Yearly grants per region.
region_trend = (
    df.groupby(['Year', 'Region'])['Grants']
    .sum()
    .reset_index()
)

# Rank regions by their grants summed across all years and keep the five
# largest; only those regions are plotted.
region_totals = region_trend.groupby('Region')['Grants'].sum()
top_regions = region_totals.sort_values(ascending=False).head(5).index
in_top_five = region_trend['Region'].isin(top_regions)
region_trend = region_trend[in_top_five]

fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    x='Year',
    y='Grants',
    hue='Region',
    marker='o',
    data=region_trend,
    ax=ax,
)
ax.set_title('Student Visas by Top 5 Regions (Yearly)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Visas')
ax.legend(title='Region')
fig.tight_layout()
plt.show()

# 7. Policy Change Annotation (Example: Graduate Route 2021)
# time_series holds one row per (Year, Quarter). Handing it to lineplot with
# only x='Year' makes seaborn collapse each year's rows with its default
# estimator (the mean) and shade a confidence band, so the chart would show an
# average quarter rather than the total the title promises. Sum to one row per
# year first so the line is the actual yearly total.
yearly_totals = time_series.groupby('Year')['Grants'].sum().reset_index()
# NOTE(review): a year that has fewer than four quarters in the extract
# (possibly the most recent one) will look artificially low — confirm coverage
# before interpreting the final point.

plt.figure(figsize=(14,6))
sns.lineplot(data=yearly_totals, x='Year', y='Grants', marker='o')
# Vertical marker at the policy year; the label feeds the legend below.
plt.axvline(x=2021, color='red', linestyle='--', label='Graduate Route Introduced')
plt.title('Total UK Student Visas with Policy Change Annotation')
plt.ylabel('Number of Visas')
plt.xlabel('Year')
plt.legend()
plt.tight_layout()
plt.show()

# 8. Statistical Testing: Pre- and Post-2021
from scipy.stats import ttest_ind

# Split the grant counts at 2021: rows from earlier years versus rows from
# 2021 onwards.
pre_policy = df.loc[df['Year'] < 2021, 'Grants']
post_policy = df.loc[df['Year'] >= 2021, 'Grants']

# NOTE(review): each observation here is one row of the filtered data, not a
# quarterly or yearly total, so the test compares mean grants per row —
# confirm that this is the intended unit of analysis.
if len(pre_policy) < 2 or len(post_policy) < 2:
    # A two-sample t-test needs at least two observations on each side;
    # report that explicitly instead of printing NaN statistics.
    t_stat = p_val = float('nan')
    print("Not enough data on one side of 2021 to run the t-test.")
else:
    # equal_var=False selects Welch's t-test, which does not assume the two
    # periods share a variance; the samples differ in size and spread, where
    # the pooled-variance test is unreliable.
    t_stat, p_val = ttest_ind(
        pre_policy,
        post_policy,
        equal_var=False,
        nan_policy='omit',
    )
    print(f"T-statistic: {t_stat:.2f}, P-value: {p_val:.4f}")

# 9. Save cleaned data for further analysis
# index=False leaves the DataFrame's row index out of the written file.
cleaned_path = 'cleaned_education_visas.csv'
df.to_csv(cleaned_path, index=False)
