# Analysis of Battles

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Checking the dataset**

In [None]:
# Load battles.csv into `df`; the path is relative to the notebook's working directory.
df=pd.read_csv('../input/game-of-thrones/battles.csv')

In [None]:
# Column names, dtypes and non-null counts of the battles table.
df.info()

In [None]:
# Preview the first rows of the battles table.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) used for the observations below.
df.describe()

**defender_3 and defender_4 contain only NaN values.**

**major_death, major_capture and summer are binary categorical columns: they contain only 0 and 1.**

**For attacker_size the mean is greater than the median, so the distribution may be skewed to the right.**

**defender_size seems roughly normal, as its mean and median are more or less the same.**

**We will get a clearer picture when we visualize the separate columns.**

In [None]:
# List the column names of the battles table.
df.columns

In [None]:
# (number of rows, number of columns) of the battles table.
df.shape

**Missing Value function**

In [None]:
def missing_check(df):
    """Summarize missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``'Total'`` (number of
        null entries) and ``'Percent'`` (null entries divided by the number of
        rows, i.e. a fraction between 0 and 1, not a value out of 100).
        Both inputs are sorted in descending order before being combined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask is the row count, so this is the null fraction.
    null_share = null_counts / null_mask.count()
    summary = pd.concat(
        [
            null_counts.sort_values(ascending=False),
            null_share.sort_values(ascending=False),
        ],
        axis=1,
        keys=['Total', 'Percent'],
    )
    return summary

In [None]:
# Per-column missing-value summary for the battles table.
missing_check(df)

**defender_4 and defender_3 have 100% missing values.**

**attacker_4, defender_2 and attacker_3 have more than 90% missing values.**

**We will drop these columns**

In [None]:
# Drop the columns that are (almost) entirely empty.
# Reassigning (instead of mutating in place) and ignoring labels that are
# already gone keeps this cell safe to re-run: the original in-place drop raised
# KeyError on a second execution because the columns no longer existed.
df = df.drop(
    columns=['defender_4', 'defender_3', 'attacker_4', 'defender_2', 'attacker_3'],
    errors='ignore',
)

# Numerical Variables

**Correlation**

In [None]:
# Pairwise correlation of the numeric columns only.
# The numeric columns are selected explicitly because recent pandas versions no
# longer skip text columns in DataFrame.corr() and raise an error instead.
df.select_dtypes(include='number').corr()

In [None]:
# seaborn is already imported as `sns` in the first code cell; this re-import is redundant.
import seaborn as sns
# Grid of pairwise plots for the battles table.
sns.pairplot(df)

**We don't find any strong correlation between the columns.**

**Checking Skewness**

In [None]:
# Distribution of attacker army sizes.
# `distplot` is deprecated in seaborn and scheduled for removal; a density-scaled
# `histplot` with a KDE overlay is its documented replacement.
sns.histplot(df['attacker_size'], kde=True, stat='density', kde_kws=dict(cut=3))

**As we guessed, the distribution is skewed to the right.**

In [None]:
# Distribution of defender army sizes.
# `distplot` is deprecated in seaborn and scheduled for removal; a density-scaled
# `histplot` with a KDE overlay is its documented replacement.
sns.histplot(df['defender_size'], kde=True, stat='density', kde_kws=dict(cut=3))

**Here the distribution looks roughly normal.**

In [None]:
# Distribution of the battle_number column.
# `distplot` is deprecated in seaborn and scheduled for removal; a density-scaled
# `histplot` with a KDE overlay is its documented replacement.
sns.histplot(df['battle_number'], kde=True, stat='density', kde_kws=dict(cut=3))

**We can see battle_number is wider around the mean.**

**We have Explored all numerical variables**

**Checking for Outliers**

In [None]:
# Box plots of the three numeric columns, stacked vertically in one figure, to
# make outliers visible. A loop replaces three copy-pasted subplot stanzas so the
# column list is the only thing to edit.
plt.figure(figsize=(20,15))
numeric_columns = ['attacker_size', 'defender_size', 'battle_number']
for panel, column in enumerate(numeric_columns, start=1):
    plt.subplot(len(numeric_columns), 1, panel)
    sns.boxplot(x=df[column], color='lightblue')


**attacker_size has a few outliers.**

# Categorical Variables

In [None]:
# List the columns still available for the categorical analysis.
df.columns

In [None]:
# Number of battles per year, most frequent year first.
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['year'],order=df['year'].value_counts().index)
plt.xlabel('year')
plt.ylabel('No of Battles')
plt.title('year')

**The largest number of battles happened in the year 299.**

In [None]:
# Number of battles per attacking king, most frequent attacker first.
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally.
# `order=` sorts the bars by frequency, matching the other count plots in this
# notebook.
sns.countplot(x=df['attacker_king'],order=df['attacker_king'].value_counts().index)
plt.xlabel('Attacking King')
plt.ylabel('No of Battles')
plt.title('Attacking King Counts')

**We can see that Joffrey/Tommen Baratheon launched the most attacks, followed by Robb Stark.**

In [None]:
# Number of battles per defending king, most frequent defender first.
plt.figure(figsize=(14,10))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['defender_king'],order=df['defender_king'].value_counts().index)
plt.xlabel('Defending king')
plt.ylabel('No of Battles')
plt.title('Defending King Counts')

**Robb Stark fought the most battles as defender, followed by Joffrey/Tommen.**

**This clearly indicates that Robb and Joffrey are attacking each other.**

In [None]:
# Number of battles per battle type, most frequent type first.
plt.figure(figsize=(14,10))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['battle_type'],order=df['battle_type'].value_counts().index)
plt.xlabel('Battle Type')
plt.ylabel('No of Battles')
plt.title('Battle Type')

**Pitched battle is the battle type preferred by most of the kings, followed by siege and ambush.**

In [None]:
# Number of battles by the major_death flag, most frequent value first.
plt.figure(figsize=(7,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['major_death'],order=df['major_death'].value_counts().index)
plt.xlabel('Death')
plt.ylabel('Count')
plt.title('Death in Battle')

**As we can see, battles with a major death are less common than battles without one.**

In [None]:
# Number of battles by the major_capture flag, most frequent value first.
plt.figure(figsize=(7,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['major_capture'],order=df['major_capture'].value_counts().index)
plt.xlabel('Capture')
plt.ylabel('Count')
plt.title('Capture in Battle')

**Relatively few battles involved a major capture.**

In [None]:
# Number of battles by the summer flag, most frequent value first.
plt.figure(figsize=(7,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['summer'],order=df['summer'].value_counts().index)
plt.xlabel('summer')
plt.ylabel('Battles')
plt.title('Battle in summer')

**As we can see, more battles happened during summer.**

In [None]:
# Number of battles by attacker outcome, most frequent outcome first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['attacker_outcome'],order=df['attacker_outcome'].value_counts().index)
plt.xlabel('Attacker Outcome')
# The y axis shows how many battles ended with each outcome, so it is labelled
# as a count (it was previously labelled 'Result').
plt.ylabel('No of Battles')
plt.title('Attacker Outcome')

**We can see that the attacking side won most of the battles.**

In [None]:
# Number of battles per region, most frequent region first.
plt.figure(figsize=(14,10))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['region'],order=df['region'].value_counts().index)
plt.xlabel('region')
plt.ylabel('Battles')
plt.title('Battle Region')

**We have left a few columns unvisualized because they do not seem to provide useful insights.**

# Bivariate Analysis

In [None]:

# Total attacker army size per attacking king, drawn as a bar chart in ascending
# order of the total.
# NOTE(review): ascending sort followed by head(10) keeps the ten SMALLEST totals;
# this only shows every king if there are at most ten of them - confirm intent.
attacker_totals = df.groupby(by='attacker_king')['attacker_size'].sum().reset_index()
attacker_totals.sort_values(['attacker_size']).head(10).plot(
    x='attacker_king',
    y='attacker_size',
    kind='bar',
    figsize=(15,5),
)

**Stannis Baratheon has the largest total attacker size compared to the others.**

In [None]:
# Battles per battle type, split by the major_death flag; bars are ordered by
# how often each battle type occurs.
plt.figure(figsize=(11,7))
battle_type_order = df['battle_type'].value_counts().index
sns.countplot(data=df, x='battle_type', hue='major_death', order=battle_type_order)

**There are no major deaths for the razing battle type. Maybe deaths were not reported for razings.**

**In siege type number of deaths are less compared to others.**

In [None]:
# Battles per region, split by the major_death flag; bars are ordered by how
# often each region occurs.
plt.figure(figsize=(11,7))
region_order = df['region'].value_counts().index
sns.countplot(data=df, x='region', hue='major_death', order=region_order)

In [None]:
# Battles by the summer flag, split by the major_death flag; bars are ordered by
# how often each summer value occurs.
plt.figure(figsize=(11,7))
summer_order = df['summer'].value_counts().index
sns.countplot(data=df, x='summer', hue='major_death', order=summer_order)

**In summer, battles with and without major deaths are roughly equal in number. However, when it is not summer, death rates are low.**

**The Riverlands have the highest number of deaths.**

In [None]:
# Battles per attacking king, split by battle type; bars are ordered by how
# often each king attacks.
plt.figure(figsize=(11,7))
attacker_king_order = df['attacker_king'].value_counts().index
sns.countplot(data=df, x='attacker_king', hue='battle_type', order=attacker_king_order)

**We can see that Joffrey/Tommen preferred pitched battles and siege attacks most of the time.**

**Robb Stark preferred ambush most of the time**

In [None]:
# Battles by the summer flag, split by the attacker's outcome.
plt.figure(figsize=(11,7))
sns.countplot(
    data=df,
    x='summer',
    hue='attacker_outcome',
)

**More wins happened in summer.**

# Statistical Analysis

Using EDA we can see that some relationships appear to exist between the features. Let us test them statistically.

**Summer vs Winning**

**Does summer have an effect on winning?**

**H0: Summer has no effect on winning.**

**H1: Summer has an effect on winning.**

**We use the chi-square test of independence because both variables are categorical.**

In [None]:
from scipy.stats import ttest_1samp,ttest_ind, wilcoxon, ttest_ind_from_stats
import scipy.stats as stats 
import numpy as np

In [None]:
# Contingency table of summer (rows) against attacker outcome (columns), followed
# by a chi-square test of independence on that table.
# `p_value` is read by the cells that follow, so the result names are unchanged.
crosstab = pd.crosstab(index=df['summer'], columns=df['attacker_outcome'])
chi, p_value, dof, expected = stats.chi2_contingency(crosstab)


In [None]:
# Show the p-value of the chi-square test computed in the previous cell.
print('P_Value:', p_value)

In [None]:
# Compare the p-value with the 5% significance level and report the decision.
verdict = (
    'Rejecting Null Hypothesis.Summer has effect on winning.'
    if p_value < 0.05
    else 'Fail to Reject Null Hypothesis.Summer has no effect on winning.'
)
print(verdict)

**Summer vs Death**

**Does summer have an effect on death?**

**H0: Summer has no effect on death.**

**H1: Summer has an effect on death.**

**We use the chi-square test of independence because both variables are categorical.**

In [None]:
# Contingency table of summer (rows) against major_death (columns), followed by a
# chi-square test of independence on that table.
# `p_value` is read by the next cell, so the result names are unchanged.
crosstab = pd.crosstab(index=df['summer'], columns=df['major_death'])
chi, p_value, dof, expected = stats.chi2_contingency(crosstab)
print('P_Value:', p_value)

In [None]:
# Compare the p-value with the 5% significance level and report the decision.
verdict = (
    'Rejecting Null Hypothesis.Summer has effect on Death.'
    if p_value < 0.05
    else 'Fail to Reject Null Hypothesis.Summer has no effect on Death.'
)
print(verdict)

**Winning vs Death**

**Does winning have an effect on death?**

**H0: Winning has no effect on death.**

**H1: Winning has an effect on death.**

**We use the chi-square test of independence because both variables are categorical.**

In [None]:
# Contingency table of attacker outcome (rows) against major_death (columns),
# followed by a chi-square test of independence on that table.
# `p_value` is read by the next cell, so the result names are unchanged.
crosstab = pd.crosstab(index=df['attacker_outcome'], columns=df['major_death'])
chi, p_value, dof, expected = stats.chi2_contingency(crosstab)
print('P_Value:', p_value)

In [None]:
# Compare the p-value with the 5% significance level and report the decision.
verdict = (
    'Rejecting Null Hypothesis.Winning has effect on Death.'
    if p_value < 0.05
    else 'Fail to Reject Null Hypothesis.Winning has no effect on Death.'
)
print(verdict)

# Character Death Analysis

In [None]:
# Load character-deaths.csv into `df_characters`; the path is relative to the notebook's working directory.
df_characters=pd.read_csv('../input/game-of-thrones/character-deaths.csv')

In [None]:
# List the column names of the character-deaths table.
df_characters.columns

In [None]:
# Preview the first rows of the character-deaths table.
df_characters.head()

In [None]:
# Column names, dtypes and non-null counts of the character-deaths table.
df_characters.info()

In [None]:
# (number of rows, number of columns) of the character-deaths table.
df_characters.shape

In [None]:
# Summary statistics (count, mean, std, quartiles) for the character-deaths table.
df_characters.describe()

In [None]:
# Per-column missing-value summary for the character-deaths table.
missing_check(df_characters)

In [None]:
# seaborn is already imported as `sns` in the first code cell; this re-import is redundant.
import seaborn as sns
# Grid of pairwise plots for the character-deaths table.
sns.pairplot(df_characters)

**Only the death chapter and the book intro chapter show a slight relationship. Each of them looks roughly normally distributed on its own.**

**Rest of the features are categorical features.**

In [None]:
# List the columns available for the categorical plots below.
df_characters.columns

In [None]:
# Number of characters per Gender value, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['Gender'],order=df_characters['Gender'].value_counts().index)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Character Gender')

**There are more male characters than female characters.**

**Number of characters who appeared in the first book**

In [None]:
# Number of characters per value of the GoT flag, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['GoT'],order=df_characters['GoT'].value_counts().index)
plt.xlabel('First Book')
plt.ylabel('Count')
plt.title('First Book Count')

**Number of characters who appeared in the second book**

In [None]:
# Number of characters per value of the CoK flag, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['CoK'],order=df_characters['CoK'].value_counts().index)
plt.xlabel('Second Book')
plt.ylabel('Count')
plt.title('Second Book Count')

**Number of characters who appeared in the third book**

In [None]:
# Number of characters per value of the SoS flag, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['SoS'],order=df_characters['SoS'].value_counts().index)
plt.xlabel('Third Book')
plt.ylabel('Count')
plt.title('Third Book Count')

**Number of characters who appeared in the fourth book**

In [None]:
# Number of characters per value of the FfC flag, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['FfC'],order=df_characters['FfC'].value_counts().index)
plt.xlabel('Fourth Book')
plt.ylabel('Count')
plt.title('Fourth Book Count')

**Number of characters who appeared in the fifth book**

In [None]:
# Number of characters per value of the DwD flag, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['DwD'],order=df_characters['DwD'].value_counts().index)
plt.xlabel('Fifth Book')
plt.ylabel('Count')
plt.title('Fifth Book Count')

**As we can see, the largest number of characters appear in the third book.**

In [None]:
# Number of characters per Nobility value, most frequent value first.
plt.figure(figsize=(9,5))
sns.set(style="darkgrid")
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df_characters['Nobility'],order=df_characters['Nobility'].value_counts().index)
plt.xlabel('Nobility vs Commoner')
plt.ylabel('Count')
plt.title('Nobility vs Commoner')

**Commoners outnumber nobles, but the two groups are nearly equal in size.**