In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Welcome to my third kernel!
I will try to perform an exploratory data analysis on a dataset of college football team stats from 2013 to 2020. Please give me your feedback, I will be glad to read it! If you like the kernel, please vote up.

I apologize for any grammar errors; English is not my first language.

# Content

1. [Reading Data](#Reading-Data)
2. [Exploratory Data Analysis](#Exploratory-Data-Analysis)

## Reading Data

In [None]:
# Load every season's CSV from the dataset folder and stack them into one frame.
data_dir = "/kaggle/input/college-football-team-stats-2019/"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and any positional result derived from it) reproducible between runs.
datasets = sorted(os.listdir(data_dir))

frames = []
for dataset in datasets:
    if not dataset.endswith(".csv"):
        continue
    aux = pd.read_csv(os.path.join(data_dir, dataset))
    # The slice keeps what sits between the first three characters and the
    # ".csv" suffix and reads it as a two-digit year
    # (assumes file names shaped like "cfb13.csv" -- TODO confirm).
    aux["Year"] = int("20" + dataset[3:-4])
    frames.append(aux)

# DataFrame.append was removed in pandas 2.0, and calling it in a loop re-copies
# the accumulated frame on every iteration; one concat does the same job once.
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

In [None]:
# Dimensions of the combined frame as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

### Touchdowns and Total.TDs columns

There are two columns about the total number of touchdowns, called *Touchdowns* and *Total.TDs*. The *Total.TDs* column only has values for the first three years, therefore I chose to drop it and keep the *Touchdowns* column.

In [None]:
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError once the
# column is already gone.
df = df.drop(columns = "Total.TDs", errors = "ignore")

### Teams and Conference

Let's separate the *Team* column into the name of the university football team and its conference.

In [None]:
def _split_team_label(label):
    """Split a raw ``Team`` label into ``[team, conference]``.

    Only the LAST " (" is treated as the separator, so a parenthesised
    qualifier that is part of the name (a label shaped like
    "Name (XX) (Conf)") stays attached to the name instead of being discarded.
    A label with no " (" at all gets an empty conference rather than a copy of
    the team name, which is what the later ``Conference == ""`` checks expect.
    """
    name, separator, tail = label.rpartition(" (")
    if not separator:
        return [label, ""]
    return [name, tail.strip(")")]


teams = df["Team"].tolist()
result = [_split_team_label(label) for label in teams]

# Positional assignment: one [team, conference] pair per row, in row order.
df[["Team", "Conference"]] = result

Note that there are three observations without a conference assigned. They are Ole Miss in 2013 and 2014, which belonged to the Southeastern Conference (SEC), and Pittsburgh in 2014, which belonged to the Atlantic Coast Conference (ACC).

In [None]:
# Distinct conference labels produced by the split; an empty string marks rows without one.
df["Conference"].unique()

In [None]:
# Rows whose conference label came out empty.
df.loc[df["Conference"].eq("")]

### Time of possession columns

Let's convert the time of possession columns from object (text) to float.

In [None]:
def get_time(row):
    """Convert the "MM:SS" time-of-possession fields of one row to decimal minutes.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row containing the labels "Time.of.Possession" and
        "Average.Time.of.Possession.per.Game".

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which every string value of those two fields is
        replaced by ``minutes + seconds / 60`` as a float. Values that are not
        strings (already converted, or missing) are left untouched, so applying
        the function twice is harmless.
    """
    cols = ["Time.of.Possession", "Average.Time.of.Possession.per.Game"]

    # Work on a copy: mutating the object that DataFrame.apply hands to the
    # function is documented by pandas as unsupported.
    row = row.copy()

    for col in cols:
        value = row[col]
        if not isinstance(value, str):
            # Nothing to parse: the value is already numeric or is missing.
            continue
        parts = value.split(":")
        row[col] = float(parts[0]) + float(parts[1]) / 60

    return row

# Run the conversion once per row ("columns" is the named form of axis 1).
df = df.apply(get_time, axis = "columns")

I know there are still problems in the data; one example was shown in the *Teams and Conference* section. I will address these problems as I progress in the data analysis.

## Exploratory Data Analysis

My objective is to answer the following four questions:

* 1. What features translate into wins?
* 2. Are special teams of particular value for a team's performance? 
* 3. Which Collegiate Conference is the best?
* 4. What's the correlation between offensive and defensive performance?

I would also like to answer the question *Does defense really win championships?*, but there is no information about which teams were invited to play in the National Championship or other bowls.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### 1. What features translate into wins?

In [None]:
# Feature frame for the correlation study: a new frame without the identifier
# columns and without the game/loss counts; df itself is left untouched.
aux = df.drop(columns = ["Team", "Conference", "Games", "Loss"])

Top 10 features with the largest Pearson correlation coefficient with the *Win* column.

In [None]:
# "Win" is looked up by name instead of assuming it is column 0, and its
# self-correlation is dropped before ranking rather than sliced off afterwards.
# Restricting to numeric columns keeps corr() working on pandas >= 2.0, which
# no longer silently discards non-numeric columns.
aux.select_dtypes(include = "number").corr(method = "pearson")["Win"].drop("Win").nlargest(n = 10)

Top 10 features with the largest Spearman correlation coefficient with the *Win* column.

In [None]:
# "Win" is looked up by name instead of assuming it is column 0, and its
# self-correlation is dropped before ranking rather than sliced off afterwards.
# Restricting to numeric columns keeps corr() working on pandas >= 2.0, which
# no longer silently discards non-numeric columns.
aux.select_dtypes(include = "number").corr(method = "spearman")["Win"].drop("Win").nlargest(n = 10)

It seems that having a team that scores lots of touchdowns is the best way to win games.

### 2. Are special teams of particular value for a team's performance?

I will use some features to measure special teams' offensive strength: *Kickoff.Return.Rank*, *Punt.Return.Rank* and *Feild.Goals*. I will also measure special teams' defensive strength with the following features: *Punt.Return.Def.Rank*, *Kickoff.Return.Def.Rank* and *Opp.Feild.Goals.Made*.

In [None]:
# Column names used as special-teams indicators, grouped by side of the ball.
# NOTE(review): the "Feild" spelling presumably mirrors the dataset's own column
# headers -- confirm before "correcting" it, since these strings are lookup keys.
offensive = ["Kickoff.Return.Rank", "Punt.Return.Rank", "Feild.Goals"]
defensive = ["Punt.Return.Def.Rank", "Kickoff.Return.Def.Rank", "Opp.Feild.Goals.Made"]

In [None]:
# df still carries the text columns Team and Conference; pandas >= 2.0 raises
# on them instead of skipping them, so correlate the numeric columns only.
df.select_dtypes(include = "number").corr(method = "pearson").loc["Win", offensive+defensive]

In [None]:
# df still carries the text columns Team and Conference; pandas >= 2.0 raises
# on them instead of skipping them, so correlate the numeric columns only.
df.select_dtypes(include = "number").corr(method = "spearman").loc["Win", offensive+defensive]

A team's victory is most correlated to *Feild.Goals*, which I assume is the number of field goals the team had during all the games.
The least correlated is the number of field goals made by the opponent team, *Opp.Feild.Goals.Made*.

### 3. Which Collegiate Conference is the best?

Let's first get the win percentage of each team for each season.

In [None]:
def win_percentage(row):
    """Add a "Win.Percentage" entry to ``row`` and return that same row.

    The value is ``Win / Games`` when ``Games`` is greater than zero, and 0
    in every other case.
    """
    games_played = row["Games"]
    row["Win.Percentage"] = row["Win"] / games_played if games_played > 0 else 0
    return row

# Compute the win percentage once per row ("columns" is the named form of axis 1).
df = df.apply(win_percentage, axis = "columns")

Let's correct the observations that do not have a Conference value.

In [None]:
def correct_conference(row):
    """Fill in the conference of a row whose ``Conference`` is an empty string.

    Only the teams "Ole Miss" (set to "SEC") and "Pittsburgh" (set to "ACC")
    are handled; every other row is returned as it came in. The row is
    modified in place and returned.
    """
    known_conferences = {"Ole Miss": "SEC", "Pittsburgh": "ACC"}

    # Rows that already carry a conference are never touched.
    if row.Conference != "":
        return row

    if row.Team in known_conferences:
        row.Conference = known_conferences[row.Team]

    return row

# Patch the empty conference labels once per row ("columns" is the named form of axis 1).
df = df.apply(correct_conference, axis = "columns")

In [None]:
# Mean and standard deviation of the win percentage per conference, best mean first.
(
    df.groupby("Conference")["Win.Percentage"]
    .agg(["mean", "std"])
    .sort_values(by = "mean", ascending = False)
)

Based on the previous table, I would say the SEC conference is the best one.

### 4. What's the correlation between offensive and defensive performance?

For this question I will look at the correlation between offensive team rank and defensive team rank.

In [None]:
# Only the two rank columns are needed, so correlate that pair alone instead of
# building the full feature-by-feature matrix twice to read one cell from it.
# This also removes the dependence on every other column of aux being numeric.
ranks = aux[["Off.Rank", "Def.Rank"]]
pearson = ranks.corr(method = "pearson").loc["Off.Rank", "Def.Rank"]
spearman = ranks.corr(method = "spearman").loc["Off.Rank", "Def.Rank"]

print("Pearson coefficient: {}\nSpearman coefficient: {}".format(pearson, spearman))

So it seems that they are not correlated. Let's visualize it.

In [None]:
# jointplot creates its own figure, so no plt.figure() call is made first
# (it would only leave an empty extra figure behind).
sns.jointplot(x = df["Def.Rank"], y = df["Off.Rank"], kind = "scatter",
              marginal_ticks = True)
# `block` is keyword-only in pyplot.show(); a bare show() is the portable form.
plt.show()

The plot shows the same answer.