In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the acquisitions dataset from the Kaggle input directory.
csv_path = "../input/acquisitions.csv"
data = pd.read_csv(csv_path)
# Show (number of rows, number of columns) as the cell output.
data.shape

In [None]:
# List all column labels of the loaded frame (displayed as the cell output).
data.columns

In [None]:
# Peek at the first 5 rows to see what the records look like.
data.head(n=5)

In [None]:
# ...and at the last 5 rows as well.
data.tail(n=5)

In [None]:
# Summarise every column: its dtype and non-null count (plus overall memory usage).
# Note: info() does not report descriptive statistics; use data.describe() for those.
data.info()

AcquisitionMonthDate and Value (USD) are of type float64. That is already a numeric type, so the `pd.to_numeric` call applied later only acts as a safeguard.

**Missing values**

Not all columns have the same number of non-null entries, which implies missing values.
Missing values must be handled in the data by either -
1. Dropping them - which is advisable only if very few entries are missing and the dataset is large enough.
2. Imputing them - replacing with zero, mean, median or mode of the attribute for numeric types and with "None" 
    for categorical types.

We will find exactly how many missing values exist in each column.


In [None]:
# Count the missing (NaN) entries in every column.
data.isna().sum()

Above output shows there are -
*  6 missing values in AcquisitionMonth
* 33 missing values in AcquisitionMonthDate
* 6 missing values in Country
* 671 missing values in Value (USD)
* 515 missing values in Derived products

In [None]:
# ------- Step 1: fill missing values in the AcquisitionMonthDate column -------
day_col = 'AcquisitionMonthDate'

# Show the distinct values currently present (NaN included).
print(np.unique(data[day_col]))

# The column is already float64, so to_numeric is only a safeguard that
# guarantees a numeric dtype before the mode is computed.
numeric_days = pd.to_numeric(data[day_col])

# Impute the gaps with the most frequent value; mode() ignores NaN and returns
# its results sorted, so [0] picks the smallest value in case of a tie.
most_common_day = numeric_days.mode()[0]
data[day_col] = numeric_days.fillna(most_common_day)

In [None]:
# ------- Step 2: fill missing values in the Country column -------
country_col = 'Country'

# Show the distinct country names (NaN appears as an entry of the set).
print(set(data[country_col]))

# Missing countries are labelled with the placeholder category 'Other'.
data[country_col] = data[country_col].fillna('Other')

# Confirm that no missing values remain (expected cell output: 0).
data[country_col].isna().sum()

In [None]:
# ------- Step 3: fill missing values in the Derived products column -------
products_col = 'Derived products'

# Show the distinct entries; NaN is among them.
print(set(data[products_col]))

# Missing entries are replaced with a single-space placeholder string
# (note: a space, not a zero-length string).
data[products_col] = data[products_col].fillna(' ')

# Confirm that no missing values remain.
print(data[products_col].isna().sum())

**Exploratory Data Analysis (EDA)**

* So far we have been cleaning the data, which is not strictly required if we only want to analyze it.
* However, if we plan on building models from our data, cleaning will be *an essential and most time-consuming* step in the process.

Let us start exploring the data to answer some questions -
1. Yearly acquisitions by different companies (Also, to find years with most acquisitions)
2. Find if there are any specific months of the year with the maximum number of acquisitions (to find existing monthly 
     trends)
3. Companies with minimum and maximum number of acquisitions.  
4. Most dominant business sector - overall and countrywise.
5. Expenditure by each company on acquisitions.


In [None]:
# Number of acquisitions per month, per year and per country: one bar chart
# for each of the three columns.
for column in ('AcquisitionMonth', 'AcquisitionYear', 'Country'):
    # sort=False keeps the bars unsorted by frequency.
    counts = data[column].value_counts(sort=False)

    fig, ax = plt.subplots(figsize=(16, 6))
    counts.plot.bar(grid=True, ax=ax)
    ax.set_xlabel(str(column) + " -->")
    ax.set_title("Acquisitions as per each {}".format(column))
    plt.show()

**Months, Years and Countries with the highest and lowest number of acquisitions**

* From the monthly plot - June, followed by July and April, clearly has the most acquisitions compared to the rest of the year.
* From the yearly plot -
  * The maximum number of companies were acquired in 2014, and the number dropped drastically in the following years.
  * The minimum number of acquisitions happened in the early years from 1987 to 1992, which is understandable as those were the budding years of some of the major market giants.

* From the country-wise plot -
  Companies in the United States are by far the most acquired, with more than six times as many acquisitions as the rest of the world, followed by those in Canada.

**An important point to note:** this column had several missing values, which we replaced with the placeholder `'Other'`. Those entries account for a considerable number of acquisitions; however, nothing can be said about which country/countries contribute to this count.

In [None]:
# Bar chart of how many acquisitions each parent company has made.
# sort=False leaves the bars unsorted by frequency.
acquisitions_per_parent = data['ParentCompany'].value_counts(sort=False)

fig, ax = plt.subplots(figsize=(12, 4))
acquisitions_per_parent.plot.bar(grid=True, ax=ax)
ax.set_xlabel("Parent Company ->")
ax.set_title("Parent companies with their respective no. of acquisitions")
plt.show()

**Most dominant acquirers**

* Google is the most dominant acquirer, with more than 200 acquisitions in its pocket.
* Microsoft is the runner-up, but it too has made more than 200 acquisitions overall.
* The least dominant acquirer is Twitter with about 50+ acquisitions till 2018, which is still a good number considering it was founded only in 2006.

In [None]:
# Most and least dominant business sectors.
# value_counts() sorts by frequency in descending order, so the head of the
# result holds the most frequent sectors and the tail the least frequent ones.
N_SECTORS = 10  # number of sectors shown in each chart
business_counts = data['Business'].value_counts(sort=True)

fig, ax = plt.subplots(figsize=(8, 8))
business_counts.head(N_SECTORS).plot.bar(grid=True, ax=ax)
ax.set_xlabel("Businesses ->")
ax.set_title("Most dominant businesses")
plt.show()

# The previous slice [::-140] picked every 140th entry walking backwards
# through the ranking, i.e. a sparse sample of the whole list rather than the
# least frequent sectors. tail() selects the actual bottom of the ranking.
# Many sectors may be tied at the minimum count, so which of the tied sectors
# appear here is not meaningful.
fig, ax = plt.subplots(figsize=(8, 8))
business_counts.tail(N_SECTORS).plot.bar(grid=True, ax=ax)
ax.set_xlabel("Businesses ->")
ax.set_title("Least dominant businesses")
plt.show()

In [None]:
# Plotly setup for the interactive per-company charts in this cell.
import plotly.offline as py                # offline plotting API; provides init_notebook_mode and iplot used below
py.init_notebook_mode(connected = True)    # enable inline rendering; connected=True presumably loads plotly.js from a CDN - confirm in plotly docs
import plotly.graph_objs as go             # graph objects used below: Bar, Layout, Figure
import plotly.tools as tls                 # NOTE(review): not referenced in the visible cells
import plotly.figure_factory as ff         # NOTE(review): not referenced in the visible cells

def Scatterplot(par_com, color):
    """Build a plotly bar trace of acquisition values for one parent company.

    Despite its name, this returns a ``go.Bar`` trace, not a scatter trace.
    It reads the module-level ``data`` frame.

    Parameters
    ----------
    par_com : value compared for equality against ``data["ParentCompany"]``
        to select the rows to plot.
    color : colour applied to the bars through ``marker.color``
        (e.g. ``'red'``). Previously this argument was accepted but ignored.

    Returns
    -------
    go.Bar
        Trace with the selected rows' ``"Company"`` on the x axis and
        ``"Value (USD)"`` on the y axis. Rows whose value is missing are
        passed through unchanged (no filtering or imputation is done here).
    """
    # Filter once instead of building the same boolean mask for x and for y.
    acquisitions = data[data["ParentCompany"] == par_com]
    tracer = go.Bar(x = acquisitions["Company"],
                    y = acquisitions["Value (USD)"],
                    marker = dict(color = color)
                   )
    return tracer

def layout_title(title):
    """Return the shared plotly layout for the per-company bar charts.

    Parameters
    ----------
    title : text used as the figure title.

    Returns
    -------
    go.Layout
        Layout with a light grey plot/paper background, white grid lines,
        x axis titled "Company", y axis titled "Value (in USD)" and a fixed
        height of 700.
    """
    background = "rgb(243,243,243)"

    def _axis(axis_title):
        # Both axes share the same grid styling and differ only in their title.
        return dict(gridcolor = 'rgb(255, 255, 255)',
                    title = axis_title,
                    zerolinewidth = 1,
                    ticklen = 5,
                    gridwidth = 2)

    settings = dict(title = title,
                    plot_bgcolor = background,
                    paper_bgcolor = background,
                    xaxis = _axis("Company"),
                    yaxis = _axis("Value (in USD)"),
                    height = 700)
    return go.Layout(settings)

# One interactive bar chart per parent company.
# Iterating over sorted unique names (instead of a set) keeps the order of the
# figures identical on every run; set ordering of strings varies between
# kernel sessions. dropna() guards sorted() against mixing NaN with strings.
for parent_company in sorted(data["ParentCompany"].dropna().unique()):
    company_trace = Scatterplot(parent_company, 'red')
    company_layout = layout_title(parent_company)
    company_fig = go.Figure(data = [company_trace], layout = company_layout)
    py.iplot(company_fig)

[A good kernel to read](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction)
